File size: 23,253 Bytes

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4821055b-c45c-4a9f-8196-2a9d09df6c39",
   "metadata": {},
   "source": [
    "# Orpheus Auto-Continuations Generator (ver. 1.0)\n",
    "\n",
    "***\n",
    "\n",
    "Powered by tegridy-tools: https://github.com/asigalov61/tegridy-tools\n",
    "\n",
    "***\n",
    "\n",
    "WARNING: This complete implementation is a functioning model of the Artificial Intelligence. Please exercise great humility, care, and respect. https://www.nscai.gov/\n",
    "\n",
    "***\n",
    "\n",
    "#### Project Los Angeles\n",
    "\n",
    "#### Tegridy Code 2025\n",
    "\n",
    "***"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6e2249a-6b57-4193-830d-7772c29b6f38",
   "metadata": {},
   "source": [
    "# Setup environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1de7766b-1df0-4281-9322-650068da2a2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "!git clone --depth 1 https://github.com/asigalov61/tegridy-tools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e9de3f7-4a3d-41d0-a6b6-1bc8fc98fa6e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!pip install huggingface_hub\n",
    "!pip install hf-transfer\n",
    "\n",
    "!pip install ipywidgets\n",
    "!pip install tqdm\n",
    "\n",
    "!pip install einx\n",
    "!pip install einops\n",
    "!pip install torch-summary\n",
    "!pip install scikit-learn\n",
    "!pip install matplotlib"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68799e16-da90-4f1b-97c8-813bd5df665e",
   "metadata": {},
   "source": [
    "# Import modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6073f1b3-edca-49b1-bfed-2029a9efda35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load modules and make data dir\n",
    "\n",
    "print('Loading modules...')\n",
    "\n",
    "import os\n",
    "\n",
    "os.environ[\"HF_HUB_ENABLE_HF_TRANSFER\"] = \"1\"\n",
    "\n",
    "import pickle\n",
    "import random\n",
    "import tqdm\n",
    "\n",
    "!set USE_FLASH_ATTENTION=1\n",
    "os.environ['USE_FLASH_ATTENTION'] = '1'\n",
    "\n",
    "import torch\n",
    "import numpy as np\n",
    "\n",
    "from torchsummary import summary\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "%cd /home/ubuntu/tegridy-tools/tegridy-tools/\n",
    "\n",
    "import TMIDIX\n",
    "\n",
    "%cd /home/ubuntu/tegridy-tools/tegridy-tools/X-Transformer\n",
    "\n",
    "from x_transformer_2_3_1 import *\n",
    "\n",
    "torch.set_float32_matmul_precision('high')\n",
    "torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul\n",
    "torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn\n",
    "torch.backends.cuda.enable_flash_sdp(True)\n",
    "torch.backends.cuda.enable_cudnn_sdp(False)\n",
    "\n",
    "!set USE_FLASH_ATTENTION=1\n",
    "\n",
    "%cd /home/ubuntu/\n",
    "\n",
    "import random\n",
    "\n",
    "from huggingface_hub import hf_hub_download\n",
    "\n",
    "print('Done')\n",
    "\n",
    "print('Torch version:', torch.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f94d805e-ac8a-400b-9e9c-a6ff572c4b80",
   "metadata": {},
   "source": [
    "# Download Orpheus model and Orpheus embeddings dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f8d75d4-982b-4a60-a234-afc71aa6dd84",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the trained checkpoint and the reference embeddings file from the\n",
    "# Hugging Face Hub. The values kept in model_file / emb_file are what the\n",
    "# loading cell later hands to torch.load and np.load.\n",
    "\n",
    "print('=' * 70)\n",
    "print('Downloading Orpheus Music Transformer model...')\n",
    "print('=' * 70)\n",
    "\n",
    "model_file = hf_hub_download(repo_id='asigalov61/Orpheus-Music-Transformer',\n",
    "                            filename='Orpheus_Music_Transformer_Trained_Model_128497_steps_0.6934_loss_0.7927_acc.pth',\n",
    "                            local_dir='/home/ubuntu/Models/',\n",
    "                            )\n",
    "\n",
    "\n",
    "print('=' * 70)\n",
    "print('Downloading Orpheus embeddings dataset...')\n",
    "print('=' * 70)\n",
    "\n",
    "emb_file = hf_hub_download(repo_id='asigalov61/Orpheus-Music-Transformer',\n",
    "                           filename='orpheus_data/1765807_Orpheus_Training_Data_Reference_MP_Embeddings_CC_BY_NC_SA.npy',\n",
    "                           local_dir='/home/ubuntu/Models/',\n",
    "                          )\n",
    "\n",
    "print('=' * 70)\n",
    "print('Done!')\n",
    "print('=' * 70)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30147799-9bc5-4acd-8352-d5fe309bd844",
   "metadata": {},
   "source": [
    "# Load model and embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d95d650e-a6b6-4fca-bc3d-75bd1c042c06",
   "metadata": {},
   "outputs": [],
   "source": [
    "#=================================================================\n",
    "\n",
    "def get_embeddings(inputs):\n",
    "    \"\"\"Return mean-pooled hidden states of the model for a batch of token sequences.\n",
    "\n",
    "    Runs the global `model` inside the global autocast context `ctx` with\n",
    "    gradients disabled, takes the last entry of the returned cache's\n",
    "    `layer_hiddens`, and averages it over dim 1.\n",
    "\n",
    "    Args:\n",
    "        inputs: token tensor accepted by `model`; callers in this notebook\n",
    "            pass a CUDA LongTensor built from a list of token lists.\n",
    "\n",
    "    Returns:\n",
    "        A float32 NumPy array holding the pooled hidden states\n",
    "        (assumes hidden is (batch, seq, dim), giving one row per\n",
    "        input sequence - TODO confirm).\n",
    "    \"\"\"\n",
    "    \n",
    "    with ctx, torch.no_grad():\n",
    "        out = model(inputs, return_outputs=True)\n",
    "    \n",
    "    # NOTE(review): element 3 of the returned tuple is used as the intermediates cache - confirm against the wrapper\n",
    "    cache = out[3]\n",
    "\n",
    "    hidden = cache.layer_hiddens[-1]\n",
    "    \n",
    "    mean_pool = torch.mean(hidden, dim=1)\n",
    "    \n",
    "    # NumPy has no bfloat16 dtype, so cast to float32 before converting;\n",
    "    # this is a no-op when the hidden states are already float32.\n",
    "    return mean_pool.detach().float().cpu().numpy()\n",
    "\n",
    "#=================================================================\n",
    "\n",
    "def exists_ratio(sub, main, ratio):\n",
    "    \"\"\"Tell whether at least `ratio` of the items in `sub` also occur in `main`.\n",
    "\n",
    "    Args:\n",
    "        sub: sequence of hashable items to look up.\n",
    "        main: iterable of hashable items to look them up in.\n",
    "        ratio: minimum fraction of `sub` that must be present in `main`.\n",
    "\n",
    "    Returns:\n",
    "        True when hits / len(sub) >= ratio, otherwise False.\n",
    "        An empty `sub` has nothing to match and yields False.\n",
    "    \"\"\"\n",
    "    if not sub:\n",
    "        return False\n",
    "\n",
    "    # Build the lookup set once instead of once per element of sub\n",
    "    main_set = set(main)\n",
    "\n",
    "    hits = sum(1 for x in sub if x in main_set)\n",
    "\n",
    "    return hits / len(sub) >= ratio\n",
    "\n",
    "#=================================================================\n",
    "\n",
    "print('=' * 70)\n",
    "print('Loading Orpheus Music Transformer model...')\n",
    "print('=' * 70)\n",
    "\n",
    "SEQ_LEN = 8192\n",
    "PAD_IDX = 18819\n",
    "\n",
    "model = TransformerWrapper(\n",
    "    num_tokens = PAD_IDX+1,\n",
    "    max_seq_len = SEQ_LEN,\n",
    "    attn_layers = Decoder(dim = 2048,\n",
    "                          depth = 8,\n",
    "                          heads = 32,\n",
    "                          rotary_pos_emb = True,\n",
    "                          attn_flash = True\n",
    "                         )\n",
    "    )\n",
    "\n",
    "model = AutoregressiveWrapper(model, ignore_index = PAD_IDX, pad_value=PAD_IDX)\n",
    "\n",
    "print('=' * 70)\n",
    "print('Loading model checkpoint...')\n",
    "\n",
    "model.load_state_dict(torch.load(model_file, weights_only=True))\n",
    "\n",
    "print('=' * 70)\n",
    "\n",
    "model.cuda()\n",
    "model.eval()\n",
    "\n",
    "print('Done!')\n",
    "\n",
    "summary(model)\n",
    "\n",
    "dtype = torch.bfloat16\n",
    "\n",
    "ctx = torch.amp.autocast(device_type='cuda', dtype=dtype)\n",
    "\n",
    "#=================================================================\n",
    "\n",
    "print('=' * 70)\n",
    "print('Loading Orpheus embeddings dataset...')\n",
    "print('=' * 70)\n",
    "\n",
    "embeddings = np.load(emb_file)\n",
    "\n",
    "print('=' * 70)\n",
    "print('Done!')\n",
    "print('=' * 70)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e3a4cd32-4680-4d35-b46e-0022369715b7",
   "metadata": {},
   "source": [
    "# Create IO dirs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d27279a0-2892-4aca-a074-1ebe3e82bb94",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('=' * 70) \n",
    "print('Creating IO dirs...')\n",
    "\n",
    "input_midis_dir = '/home/ubuntu/Input MIDIs/'\n",
    "output_midis_dir = '/home/ubuntu/Output MIDIs/'\n",
    "\n",
    "midi_files_list = []\n",
    "\n",
    "os.makedirs(input_midis_dir, exist_ok=True)\n",
    "os.makedirs(output_midis_dir, exist_ok=True)\n",
    "\n",
    "print('Done!')\n",
    "print('=' * 70) "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03e505df-42bc-4246-8b11-1e98e7b2515a",
   "metadata": {},
   "source": [
    "# Create MIDIs files list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "829f6093-67f5-4655-83c7-8af36ef60079",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('=' * 70)\n",
    "print('Creating MIDI files list...')\n",
    "print('=' * 70) \n",
    "\n",
    "midi_files_list = TMIDIX.create_files_list([input_midis_dir])\n",
    "print('=' * 70) "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2e041a5-5564-41d1-a2b2-0cc92dc713ae",
   "metadata": {},
   "source": [
    "# Generate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb9a7864-028b-468f-9fda-72abe84d6edc",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print('=' * 70) \n",
    "print('Orpheus Auto-Continuations Generator')\n",
    "print('=' * 70)\n",
    "\n",
    "#=========================================================================\n",
    "# Generation options\n",
    "#=========================================================================\n",
    "\n",
    "# Primary generation options\n",
    "num_prime_tokens = 1024\n",
    "num_songs_per_midi = 4\n",
    "num_gen_chunks = 12\n",
    "max_num_tries = 4\n",
    "\n",
    "# Model sampling options\n",
    "num_gen_tokens = 512\n",
    "batch_size = 32\n",
    "temperature = 1.0\n",
    "top_p_value = 0.96\n",
    "num_mem_tokens = 7168 # up to 12 chunks\n",
    "use_prime_embeddings = False\n",
    "\n",
    "# Advanced options\n",
    "max_tok_rep_ratio = 0.95\n",
    "num_rep_window_toks = 1024\n",
    "num_emb_tokens = 1024\n",
    "\n",
    "# Aux options\n",
    "score_var = 0.05\n",
    "batch_size_step = 4\n",
    "\n",
    "#=========================================================================\n",
    "\n",
    "# With no input MIDIs, sample a batch of prime sequences from the model\n",
    "# itself and continue from one of them under the name 'Improvisation'.\n",
    "if not midi_files_list:\n",
    "    \n",
    "    print('=' * 70)\n",
    "    print('Generating prime tokens...')\n",
    "    print('=' * 70)\n",
    "\n",
    "    # Every row of the batch starts from the same two tokens: 18816 then 0\n",
    "    x = torch.LongTensor([[18816, 0]] * batch_size).cuda()\n",
    "\n",
    "    with ctx:\n",
    "        out = model.generate(x,\n",
    "                             num_prime_tokens,\n",
    "                             temperature=temperature,\n",
    "                             filter_logits_fn=top_p,\n",
    "                             filter_kwargs={'thres': top_p_value},\n",
    "                             return_prime=True,\n",
    "                             verbose=True)\n",
    "\n",
    "    y = out.tolist()\n",
    "    \n",
    "    inp = torch.LongTensor(y).cuda()\n",
    "    \n",
    "    prime_embs = get_embeddings(inp)\n",
    "    \n",
    "    # Best similarity of each generated prime against the reference embeddings.\n",
    "    # all_scores[k] belongs to y[k], so this list must stay unfiltered.\n",
    "    all_scores = cosine_similarity(embeddings, prime_embs).max(axis=0).tolist()\n",
    "\n",
    "    # Drop the top value(s) and select the runner-up\n",
    "    # (presumably to avoid primes that sit too close to the reference data - TODO confirm)\n",
    "    top_score = max(all_scores)\n",
    "    scores = [o for o in all_scores if o != top_score]\n",
    "\n",
    "    # When every prime ties (e.g. batch_size == 1) there is no runner-up; use the top score\n",
    "    max_score = max(scores) if scores else top_score\n",
    "\n",
    "    # Index into the UNFILTERED list: an index taken from the filtered list is\n",
    "    # shifted by one whenever the removed top score comes before the runner-up.\n",
    "    max_score_idx = all_scores.index(max_score)\n",
    "    melody_chords = y[max_score_idx]\n",
    "\n",
    "    midi_fname = 'Improvisation'\n",
    "    midi_files_list.append(midi_fname)\n",
    "    \n",
    "    print('=' * 70)\n",
    "    print('Done!')\n",
    "    print('=' * 70)\n",
    "    print('Generating songs for \"Improvisation\"')\n",
    "    print('=' * 70)\n",
    "    \n",
    "#=========================================================================\n",
    "\n",
    "for midi_file in midi_files_list:\n",
    "\n",
    "    # Tokenize a real input MIDI file into melody_chords. The 'Improvisation'\n",
    "    # entry is skipped here because its tokens were already sampled from the model.\n",
    "    if midi_file != 'Improvisation':\n",
    "\n",
    "        midi_fname = os.path.splitext(os.path.basename(midi_file))[0]\n",
    "    \n",
    "        print('=' * 70)\n",
    "        print('Generating songs for MIDI file \"' + midi_fname + '\"')\n",
    "        print('-' * 70)    \n",
    "    \n",
    "        #==============================================================================\n",
    "    \n",
    "        # TMIDIX preprocessing chain; each step feeds the next one\n",
    "        raw_score = TMIDIX.midi2single_track_ms_score(midi_file)\n",
    "        \n",
    "        escore_notes = TMIDIX.advanced_score_processor(raw_score, return_enhanced_score_notes=True, apply_sustain=True)\n",
    "        \n",
    "        escore_notes = TMIDIX.augment_enhanced_score_notes(escore_notes[0], sort_drums_last=True)\n",
    "    \n",
    "        escore_notes = TMIDIX.remove_duplicate_pitches_from_escore_notes(escore_notes)\n",
    "    \n",
    "        escore_notes = TMIDIX.fix_escore_notes_durations(escore_notes, min_notes_gap=0)\n",
    "        \n",
    "        dscore = TMIDIX.delta_score_notes(escore_notes)\n",
    "        \n",
    "        # The first field of every delta-score entry is dropped before chordifying\n",
    "        dcscore = TMIDIX.chordify_score([d[1:] for d in dscore])\n",
    "        \n",
    "        # The sequence opens with token 18816 (the same token the generated primes start from)\n",
    "        melody_chords = [18816]\n",
    "        \n",
    "        #=======================================================\n",
    "        # MAIN PROCESSING CYCLE\n",
    "        #=======================================================\n",
    "        \n",
    "        for i, c in enumerate(dcscore):\n",
    "        \n",
    "            # One time token per chord, taken from the chord's first note\n",
    "            # NOTE(review): appended unclamped - assumes TMIDIX keeps it within 0..255\n",
    "            # so it cannot collide with the note token ranges below - TODO confirm\n",
    "            delta_time = c[0][0]\n",
    "        \n",
    "            melody_chords.append(delta_time)\n",
    "        \n",
    "            for e in c:\n",
    "            \n",
    "                #=======================================================\n",
    "                \n",
    "                # Durations\n",
    "                dur = max(1, min(255, e[1]))\n",
    "        \n",
    "                # Patches\n",
    "                pat = max(0, min(128, e[5]))\n",
    "                \n",
    "                # Pitches\n",
    "                ptc = max(1, min(127, e[3]))\n",
    "                \n",
    "                # Velocities\n",
    "                # Calculating octo-velocity\n",
    "                \n",
    "                # vel is clamped to 8..127, so velocity ends up in 0..7\n",
    "                vel = max(8, min(127, e[4]))\n",
    "                velocity = round(vel / 15)-1\n",
    "                \n",
    "                #=======================================================\n",
    "                # FINAL NOTE SEQ\n",
    "                #=======================================================\n",
    "                \n",
    "                # Writing final note\n",
    "                pat_ptc = (128 * pat) + ptc \n",
    "                dur_vel = (8 * dur) + velocity\n",
    "        \n",
    "                # Two tokens per note: patch/pitch lands in 257..16767, duration/velocity in 16776..18815\n",
    "                melody_chords.extend([pat_ptc+256, dur_vel+16768]) # 18816\n",
    "\n",
    "    #==============================================================================\n",
    "\n",
    "    print('Total number of input tokens:', len(melody_chords))\n",
    "    print('=' * 70)\n",
    "\n",
    "    #==============================================================================\n",
    "\n",
    "    song_number = 0\n",
    "    \n",
    "    while song_number < num_songs_per_midi:\n",
    "\n",
    "        print('Generating song #', song_number+1, '/', num_songs_per_midi)\n",
    "        print('=' * 70)\n",
    "    \n",
    "        song = melody_chords[:num_prime_tokens][-num_mem_tokens:]\n",
    "        \n",
    "        inp = torch.LongTensor([song]).cuda()\n",
    "        \n",
    "        prime_embs = get_embeddings(inp)\n",
    "        \n",
    "        start_score = cosine_similarity(embeddings, prime_embs).max(axis=0)[0]\n",
    "\n",
    "        b_size = batch_size\n",
    "        stop = False\n",
    "        \n",
    "        # Extend the song one chunk at a time; each chunk is the runner-up candidate\n",
    "        # from a sampled batch, accepted once its score is close enough to the prime score.\n",
    "        for i in tqdm.tqdm(range(num_gen_chunks)):\n",
    "        \n",
    "            max_score = -1\n",
    "            num_tries = 0\n",
    "    \n",
    "            # Late chunks see the longest contexts, so shrink the sampling batch by\n",
    "            # batch_size_step per chunk (never below 1). The result has to be stored\n",
    "            # in b_size, which is the name the sampling call below reads.\n",
    "            if i > 7:\n",
    "                b_size = max(1, b_size - batch_size_step)\n",
    "        \n",
    "            # NOTE(review): a batch whose runner-up score is too low is resampled\n",
    "            # without counting as a try, so this loop has no upper bound in that case.\n",
    "            while max_score < start_score - score_var:\n",
    "                \n",
    "                output = []\n",
    "                output_scores = []\n",
    "                \n",
    "                x = torch.LongTensor([song[-num_mem_tokens:]] * b_size).cuda()\n",
    "            \n",
    "                with ctx:\n",
    "                    out = model.generate(x,\n",
    "                                         num_gen_tokens,\n",
    "                                         temperature=temperature,\n",
    "                                         filter_logits_fn=top_p,\n",
    "                                         filter_kwargs={'thres': top_p_value},\n",
    "                                         return_prime=True,\n",
    "                                         verbose=False)\n",
    "        \n",
    "                y = out.tolist()\n",
    "        \n",
    "                # Keep only candidates without tokens 18817/18818 whose new tokens\n",
    "                # are not mostly repeats of the recent song window\n",
    "                for yy in y:\n",
    "                    if 18817 not in yy and 18818 not in yy and not exists_ratio(yy[-num_gen_tokens:], \n",
    "                                                                                song[-num_rep_window_toks:], \n",
    "                                                                                max_tok_rep_ratio\n",
    "                                                                               ):\n",
    "                        output.append(yy[-num_emb_tokens:])\n",
    "        \n",
    "                if output:\n",
    "                \n",
    "                    inp = torch.LongTensor(output).cuda()\n",
    "                    \n",
    "                    embs = get_embeddings(inp)\n",
    "\n",
    "                    if use_prime_embeddings:\n",
    "                        scores = cosine_similarity(prime_embs, embs).max(axis=0)    \n",
    "\n",
    "                    else:\n",
    "                        scores = cosine_similarity(embeddings, embs).max(axis=0)\n",
    "                        \n",
    "                    output_scores.extend(scores)\n",
    "            \n",
    "                    # Drop the top score(s) and rank by the runner-up; the max is\n",
    "                    # computed once instead of once per list element\n",
    "                    best_score = max(output_scores)\n",
    "                    scores = [o for o in output_scores if o != best_score]\n",
    "                    \n",
    "                    if not scores:\n",
    "                        # Only one distinct score survived, so there is no runner-up: count a failed try\n",
    "                        max_score = -1\n",
    "                        num_tries += 1\n",
    "    \n",
    "                        if num_tries == max_num_tries:\n",
    "                            stop = True\n",
    "                            break\n",
    "                            \n",
    "                        # Back off by one chunk before retrying\n",
    "                        if i > max_num_tries:\n",
    "                            song = song[:-num_gen_tokens]\n",
    "                            \n",
    "                    else:\n",
    "                        max_score = max(scores)\n",
    "                        \n",
    "                else:\n",
    "                    # Every candidate was rejected: count a failed try\n",
    "                    num_tries += 1\n",
    "                    \n",
    "                    if num_tries == max_num_tries:\n",
    "                        stop = True\n",
    "                        break\n",
    "                        \n",
    "                    if i > max_num_tries:\n",
    "                        song = song[:-num_gen_tokens]\n",
    "                        \n",
    "            if stop:\n",
    "                break\n",
    "        \n",
    "            # output_scores is parallel to output, so the index of the chosen score selects the chunk\n",
    "            max_score_idx = output_scores.index(max_score)\n",
    "            max_score_chunk = output[max_score_idx]\n",
    "        \n",
    "            song.extend(max_score_chunk[-num_gen_tokens:])\n",
    "\n",
    "        #==============================================================================\n",
    "    \n",
    "        # Save the song only when generation got past the halfway chunk; otherwise\n",
    "        # song_number stays unchanged and the enclosing while loop tries again.\n",
    "        if i > num_gen_chunks // 2:\n",
    "\n",
    "            print('=' * 70)\n",
    "            print('Saving song...')\n",
    "            print('=' * 70)\n",
    "        \n",
    "            print('Sample INTs', song[:15])\n",
    "            \n",
    "            song_f = []\n",
    "            \n",
    "            # Running decoder state; the defaults apply until the first matching token is seen\n",
    "            time = 0\n",
    "            dur = 1\n",
    "            vel = 90\n",
    "            pitch = 60\n",
    "            channel = 0\n",
    "            patch = 0\n",
    "            \n",
    "            # patches[ch] is the patch assigned to channel ch (-1 = unassigned);\n",
    "            # channels[ch] is 1 once taken. Channel 9 is reserved up front.\n",
    "            patches = [-1] * 16\n",
    "            \n",
    "            channels = [0] * 16\n",
    "            channels[9] = 1\n",
    "            \n",
    "            # The three token ranges are mutually exclusive, hence the elif chain\n",
    "            for ss in song:\n",
    "            \n",
    "                if 0 <= ss < 256:\n",
    "            \n",
    "                    # Time token: advance the running time by 16 per step\n",
    "                    time += ss * 16\n",
    "            \n",
    "                elif 256 <= ss < 16768:\n",
    "            \n",
    "                    # Patch/pitch token: patch is 0..128 in this range\n",
    "                    patch = (ss-256) // 128\n",
    "            \n",
    "                    if patch < 128:\n",
    "            \n",
    "                        if patch not in patches:\n",
    "                            # Take the first free channel, or reuse channel 15 when none is left\n",
    "                            if 0 in channels:\n",
    "                                cha = channels.index(0)\n",
    "                                channels[cha] = 1\n",
    "                            else:\n",
    "                                cha = 15\n",
    "            \n",
    "                            patches[cha] = patch\n",
    "            \n",
    "                        channel = patches.index(patch)\n",
    "            \n",
    "                    else:\n",
    "                        # patch == 128 always goes to channel 9\n",
    "                        channel = 9\n",
    "            \n",
    "                    pitch = (ss-256) % 128\n",
    "            \n",
    "                elif 16768 <= ss < 18816:\n",
    "            \n",
    "                    # Duration/velocity token: completes the note and emits it\n",
    "                    dur = ((ss-16768) // 8) * 16\n",
    "                    vel = (((ss-16768) % 8)+1) * 15\n",
    "            \n",
    "                    song_f.append(['note', time, dur, channel, pitch, vel, patch])\n",
    "\n",
    "            # The patch list written to the MIDI file comes from TMIDIX here,\n",
    "            # not from the channel bookkeeping done in the loop above\n",
    "            output_score, patches, overflow_patches = TMIDIX.patch_enhanced_score_notes(song_f)\n",
    "\n",
    "            output_dir = os.path.join(output_midis_dir, midi_fname)\n",
    "\n",
    "            os.makedirs(output_dir, exist_ok=True)\n",
    "            \n",
    "            detailed_stats = TMIDIX.Tegridy_ms_SONG_to_MIDI_Converter(output_score,\n",
    "                                                                      output_signature = 'Orpheus Music Transformer',\n",
    "                                                                      output_file_name =  output_dir + '/Orpheus-Music-Transformer-Composition_'+str(song_number+1).zfill(3),\n",
    "                                                                      track_name='Project Los Angeles',\n",
    "                                                                      list_of_MIDI_patches=patches\n",
    "                                                                      )\n",
    "\n",
    "            song_number += 1\n",
    "            \n",
    "            print('=' * 70)\n",
    "            print('Done!')\n",
    "            print('=' * 70)\n",
    "\n",
    "        else:\n",
    "\n",
    "            # Make the retry visible instead of silently looping on the same song number\n",
    "            print('=' * 70)\n",
    "            print('Generation stopped too early. Discarding song and retrying...')\n",
    "            print('=' * 70)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f892ac8a-9f5f-462d-b3b9-d4be1f78b31d",
   "metadata": {},
   "source": [
    "# Congrats! You did it! :)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}