SteveZerb committed (verified)
Commit 671fe47 · 1 Parent(s): da5ef74

Upload 10 files

Files changed (10)
  1. Dockerfile +43 -0
  2. MIDI.py +1735 -0
  3. README.md +8 -8
  4. app.py +533 -0
  5. app_onnx.py +625 -0
  6. midi_model.py +250 -0
  7. midi_synthesizer.py +81 -0
  8. midi_tokenizer.py +1196 -0
  9. packages.txt +1 -0
  10. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,43 @@
+ FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04
+
+ ARG DEBIAN_FRONTEND=noninteractive
+
+ ENV PYTHONUNBUFFERED=1
+
+ RUN apt-get update && apt-get install --no-install-recommends -y \
+     build-essential \
+     python3.9 \
+     python3-pip \
+     git \
+     ffmpeg \
+     fluidsynth \
+     && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python3", "app.py"]
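The GRADIO_* variables above configure the Gradio app that the closing CMD launches. As a hedged illustration of how those settings become visible to the Python process inside the container (this sketch is not taken from the uploaded app.py):

    import os
    # Values set by the ENV instruction above; Gradio itself picks up
    # GRADIO_SERVER_NAME at launch time.
    print(os.environ.get('GRADIO_SERVER_NAME'))  # expected: 0.0.0.0
    print(os.environ.get('GRADIO_THEME'))        # expected: huggingface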
MIDI.py ADDED
@@ -0,0 +1,1735 @@
+ #! /usr/bin/python3
+ # unsupported 20091104 ...
+ # ['set_sequence_number', dtime, sequence]
+ # ['raw_data', dtime, raw]
+
+ # 20150914 jimbo1qaz MIDI.py str/bytes bug report
+ # I found a MIDI file which had Shift-JIS titles. When midi.py decodes it as
+ # latin-1, it produces a string which cannot even be accessed without raising
+ # a UnicodeDecodeError. Maybe, when converting raw byte strings from MIDI,
+ # you should keep them as bytes, not improperly decode them. However, this
+ # would change the API. (ie: text = a "string" ? of 0 or more bytes). It
+ # could break compatibility, but there's not much else you can do to fix the bug
+ # https://en.wikipedia.org/wiki/Shift_JIS
+
+ r'''
+ This module offers functions: concatenate_scores(), grep(),
+ merge_scores(), mix_scores(), midi2opus(), midi2score(), opus2midi(),
+ opus2score(), play_score(), score2midi(), score2opus(), score2stats(),
+ score_type(), segment(), timeshift() and to_millisecs(),
+ where "midi" means the MIDI-file bytes (as can be put in a .mid file,
+ or piped into aplaymidi), and "opus" and "score" are list-structures
+ as inspired by Sean Burke's MIDI-Perl CPAN module.
+
+ Warning: Version 6.4 is not necessarily backward-compatible with
+ previous versions, in that text-data is now bytes, not strings.
+ This reflects the fact that many MIDI files have text data in
+ encodings other than ISO-8859-1, for example in Shift-JIS.
+
+ Download MIDI.py from http://www.pjb.com.au/midi/free/MIDI.py
+ and put it in your PYTHONPATH. MIDI.py depends on Python3.
+
+ There is also a call-compatible translation into Lua of this
+ module: see http://www.pjb.com.au/comp/lua/MIDI.html
+
+ The "opus" is a direct translation of the midi-file-events, where
+ the times are delta-times, in ticks, since the previous event.
+
+ The "score" is more human-centric; it uses absolute times, and
+ combines the separate note_on and note_off events into one "note"
+ event, with a duration:
+ ['note', start_time, duration, channel, note, velocity] # in a "score"
+
+ EVENTS (in an "opus" structure)
+ ['note_off', dtime, channel, note, velocity] # in an "opus"
+ ['note_on', dtime, channel, note, velocity] # in an "opus"
+ ['key_after_touch', dtime, channel, note, velocity]
+ ['control_change', dtime, channel, controller(0-127), value(0-127)]
+ ['patch_change', dtime, channel, patch]
+ ['channel_after_touch', dtime, channel, velocity]
+ ['pitch_wheel_change', dtime, channel, pitch_wheel]
+ ['text_event', dtime, text]
+ ['copyright_text_event', dtime, text]
+ ['track_name', dtime, text]
+ ['instrument_name', dtime, text]
+ ['lyric', dtime, text]
+ ['marker', dtime, text]
+ ['cue_point', dtime, text]
+ ['text_event_08', dtime, text]
+ ['text_event_09', dtime, text]
+ ['text_event_0a', dtime, text]
+ ['text_event_0b', dtime, text]
+ ['text_event_0c', dtime, text]
+ ['text_event_0d', dtime, text]
+ ['text_event_0e', dtime, text]
+ ['text_event_0f', dtime, text]
+ ['end_track', dtime]
+ ['set_tempo', dtime, tempo]
+ ['smpte_offset', dtime, hr, mn, se, fr, ff]
+ ['time_signature', dtime, nn, dd, cc, bb]
+ ['key_signature', dtime, sf, mi]
+ ['sequencer_specific', dtime, raw]
+ ['raw_meta_event', dtime, command(0-255), raw]
+ ['sysex_f0', dtime, raw]
+ ['sysex_f7', dtime, raw]
+ ['song_position', dtime, song_pos]
+ ['song_select', dtime, song_number]
+ ['tune_request', dtime]
+
+ DATA TYPES
+ channel = a value 0 to 15
+ controller = 0 to 127 (see http://www.pjb.com.au/muscript/gm.html#cc )
+ dtime = time measured in "ticks", 0 to 268435455
+ velocity = a value 0 (soft) to 127 (loud)
+ note = a value 0 to 127 (middle-C is 60)
+ patch = 0 to 127 (see http://www.pjb.com.au/muscript/gm.html )
+ pitch_wheel = a value -8192 to 8191 (0x1FFF)
+ raw = bytes, of length 0 or more (for sysex events see below)
+ sequence_number = a value 0 to 65,535 (0xFFFF)
+ song_pos = a value 0 to 16,383 (0x3FFF)
+ song_number = a value 0 to 127
+ tempo = microseconds per crochet (quarter-note), 0 to 16777215
+ text = bytes, of length 0 or more
+ ticks = the number of ticks per crochet (quarter-note)
+
+ In sysex_f0 events, the raw data must not start with a \xF0 byte,
+ since this gets added automatically;
+ but it must end with an explicit \xF7 byte!
+ In the very unlikely case that you ever need to split sysex data
+ into one sysex_f0 followed by one or more sysex_f7s, then only the
+ last of those sysex_f7 events must end with the explicit \xF7 byte
+ (again, the raw data of individual sysex_f7 events must not start
+ with any \xF7 byte, since this gets added automatically).
+
+ Since version 6.4, text data is in bytes, not in an ISO-8859-1 string.
+
+
+ GOING THROUGH A SCORE WITHIN A PYTHON PROGRAM
+ channels = {2,3,5,8,13}
+ itrack = 1   # skip 1st element which is ticks
+ while itrack < len(score):
+     for event in score[itrack]:
+         if event[0] == 'note':   # for example,
+             pass   # do something to all notes
+         # or, to work on events in only particular channels...
+         channel_index = MIDI.Event2channelindex.get(event[0], False)
+         if channel_index and (event[channel_index] in channels):
+             pass   # do something to channels 2,3,5,8 and 13
+     itrack += 1
+
+ '''
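# Illustrative round-trip sketch (not part of the uploaded file): parse a
# .mid file into a "score" and list its note events; 'example.mid' is an
# arbitrary file name.
#   with open('example.mid', 'rb') as f:
#       score = midi2score(f.read())
#   print('ticks per quarter-note:', score[0])
#   for track in score[1:]:
#       for event in track:
#           if event[0] == 'note':
#               print(event)  # ['note', start, duration, channel, pitch, velocity]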
+
+ import sys, struct, copy
+ # sys.stdout = os.fdopen(sys.stdout.fileno(), 'wb')
+ Version = '6.7'
+ VersionDate = '20201120'
+ # 20201120 6.7 call to bytest() removed, and protect _unshift_ber_int
+ # 20160702 6.6 to_millisecs() now handles set_tempo across multiple Tracks
+ # 20150921 6.5 segment restores controllers as well as patch and tempo
+ # 20150914 6.4 text data is bytes or bytearray, not ISO-8859-1 strings
+ # 20150628 6.3 absent any set_tempo, default is 120bpm (see MIDI file spec 1.1)
+ # 20150101 6.2 all text events can be 8-bit; let user get the right encoding
+ # 20141231 6.1 fix _some_text_event; sequencer_specific data can be 8-bit
+ # 20141230 6.0 synth_specific data can be 8-bit
+ # 20120504 5.9 add the contents of mid_opus_tracks()
+ # 20120208 5.8 fix num_notes_by_channel() ; should be a dict
+ # 20120129 5.7 _encode handles empty tracks; score2stats num_notes_by_channel
+ # 20111111 5.6 fix patch 45 and 46 in Number2patch, should be Harp
+ # 20110129 5.5 add mix_opus_tracks() and event2alsaseq()
+ # 20110126 5.4 "previous message repeated N times" to save space on stderr
+ # 20110125 5.2 opus2score terminates unended notes at the end of the track
+ # 20110124 5.1 the warnings in midi2opus display track_num
+ # 21110122 5.0 if garbage, midi2opus returns the opus so far
+ # 21110119 4.9 non-ascii chars stripped out of the text_events
+ # 21110110 4.8 note_on with velocity=0 treated as a note-off
+ # 21110108 4.6 unknown F-series event correctly eats just one byte
+ # 21011010 4.2 segment() uses start_time, end_time named params
+ # 21011005 4.1 timeshift() must not pad the set_tempo command
+ # 21011003 4.0 pitch2note_event must be chapitch2note_event
+ # 21010918 3.9 set_sequence_number supported, FWIW
+ # 20100913 3.7 many small bugfixes; passes all tests
+ # 20100910 3.6 concatenate_scores enforce ticks=1000, just like merge_scores
+ # 20100908 3.5 minor bugs fixed in score2stats
+ # 20091104 3.4 tune_request now supported
+ # 20091104 3.3 fixed bug in decoding song_position and song_select
+ # 20091104 3.2 unsupported: set_sequence_number tune_request raw_data
+ # 20091101 3.1 document how to traverse a score within Python
+ # 20091021 3.0 fixed bug in score2stats detecting GM-mode = 0
+ # 20091020 2.9 score2stats reports GM-mode and bank msb,lsb events
+ # 20091019 2.8 in merge_scores, channel 9 must remain channel 9 (in GM)
+ # 20091018 2.7 handles empty tracks gracefully
+ # 20091015 2.6 grep() selects channels
+ # 20091010 2.5 merge_scores reassigns channels to avoid conflicts
+ # 20091010 2.4 fixed bug in to_millisecs which now only does opusses
+ # 20091010 2.3 score2stats returns channels & patch_changes, by_track & total
+ # 20091010 2.2 score2stats() returns also pitches and percussion dicts
+ # 20091010 2.1 bugs: >= not > in segment, to notice patch_change at time 0
+ # 20091010 2.0 bugs: spurious pop(0) ( in _decode sysex
+ # 20091008 1.9 bugs: ISO decoding in sysex; str( not int( in note-off warning
+ # 20091008 1.8 add concatenate_scores()
+ # 20091006 1.7 score2stats() measures nticks and ticks_per_quarter
+ # 20091004 1.6 first mix_scores() and merge_scores()
+ # 20090424 1.5 timeshift() bugfix: earliest only sees events after from_time
+ # 20090330 1.4 timeshift() has also a from_time argument
+ # 20090322 1.3 timeshift() has also a start_time argument
+ # 20090319 1.2 add segment() and timeshift()
+ # 20090301 1.1 add to_millisecs()
+
+ _previous_warning = ''  # 5.4
+ _previous_times = 0  # 5.4
+ _no_warning = True
+ #------------------------------- Encoding stuff --------------------------
+
+ def opus2midi(opus=[]):
+     r'''The argument is a list: the first item in the list is the "ticks"
+     parameter, the others are the tracks. Each track is a list
+     of midi-events, and each event is itself a list; see above.
+     opus2midi() returns a bytestring of the MIDI, which can then be
+     written either to a file opened in binary mode (mode='wb'),
+     or to stdout by means of: sys.stdout.buffer.write()
+
+     my_opus = [
+         96,
+         [   # track 0:
+             ['patch_change', 0, 1, 8],   # and these are the events...
+             ['note_on', 5, 1, 25, 96],
+             ['note_off', 96, 1, 25, 0],
+             ['note_on', 0, 1, 29, 96],
+             ['note_off', 96, 1, 29, 0],
+         ],   # end of track 0
+     ]
+     my_midi = opus2midi(my_opus)
+     sys.stdout.buffer.write(my_midi)
+     '''
+     if len(opus) < 2:
+         opus=[1000, [],]
+     tracks = copy.deepcopy(opus)
+     ticks = int(tracks.pop(0))
+     ntracks = len(tracks)
+     if ntracks == 1:
+         format = 0
+     else:
+         format = 1
+
+     my_midi = b"MThd\x00\x00\x00\x06"+struct.pack('>HHH',format,ntracks,ticks)
+     for track in tracks:
+         events = _encode(track)
+         my_midi += b'MTrk' + struct.pack('>I',len(events)) + events
+     _clean_up_warnings()
+     return my_midi
+
+
+ def score2opus(score=None):
+     r'''
+     The argument is a list: the first item in the list is the "ticks"
+     parameter, the others are the tracks. Each track is a list
+     of score-events, and each event is itself a list. A score-event
+     is similar to an opus-event (see above), except that in a score:
+     1) the times are expressed as an absolute number of ticks
+        from the track's start time
+     2) the pairs of 'note_on' and 'note_off' events in an "opus"
+        are abstracted into a single 'note' event in a "score":
+        ['note', start_time, duration, channel, pitch, velocity]
+     score2opus() returns a list specifying the equivalent "opus".
+
+     my_score = [
+         96,
+         [   # track 0:
+             ['patch_change', 0, 1, 8],
+             ['note', 5, 96, 1, 25, 96],
+             ['note', 101, 96, 1, 29, 96]
+         ],   # end of track 0
+     ]
+     my_opus = score2opus(my_score)
+     '''
+     if len(score) < 2:
+         score=[1000, [],]
+     tracks = copy.deepcopy(score)
+     ticks = int(tracks.pop(0))
+     opus_tracks = []
+     for scoretrack in tracks:
+         time2events = dict([])
+         for scoreevent in scoretrack:
+             if scoreevent[0] == 'note':
+                 note_on_event = ['note_on',scoreevent[1],
+                     scoreevent[3],scoreevent[4],scoreevent[5]]
+                 note_off_event = ['note_off',scoreevent[1]+scoreevent[2],
+                     scoreevent[3],scoreevent[4],scoreevent[5]]
+                 if time2events.get(note_on_event[1]):
+                     time2events[note_on_event[1]].append(note_on_event)
+                 else:
+                     time2events[note_on_event[1]] = [note_on_event,]
+                 if time2events.get(note_off_event[1]):
+                     time2events[note_off_event[1]].append(note_off_event)
+                 else:
+                     time2events[note_off_event[1]] = [note_off_event,]
+                 continue
+             if time2events.get(scoreevent[1]):
+                 time2events[scoreevent[1]].append(scoreevent)
+             else:
+                 time2events[scoreevent[1]] = [scoreevent,]
+
+         sorted_times = []  # list of keys
+         for k in time2events.keys():
+             sorted_times.append(k)
+         sorted_times.sort()
+
+         sorted_events = []  # once-flattened list of values sorted by key
+         for time in sorted_times:
+             sorted_events.extend(time2events[time])
+
+         abs_time = 0
+         for event in sorted_events:  # convert abs times => delta times
+             delta_time = event[1] - abs_time
+             abs_time = event[1]
+             event[1] = delta_time
+         opus_tracks.append(sorted_events)
+     opus_tracks.insert(0,ticks)
+     _clean_up_warnings()
+     return opus_tracks
+
+ def score2midi(score=None):
+     r'''
+     Translates a "score" into MIDI, using score2opus() then opus2midi()
+     '''
+     return opus2midi(score2opus(score))
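# Illustrative sketch (not part of the uploaded file): build a tiny
# one-track score and write it to disk with score2midi(); the output
# file name is arbitrary.
#   my_score = [96, [['patch_change', 0, 1, 8], ['note', 5, 96, 1, 25, 96]]]
#   with open('example-out.mid', 'wb') as f:
#       f.write(score2midi(my_score))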
+
+ #--------------------------- Decoding stuff ------------------------
+
+ def midi2opus(midi=b''):
+     r'''Translates MIDI into a "opus". For a description of the
+     "opus" format, see opus2midi()
+     '''
+     my_midi=bytearray(midi)
+     if len(my_midi) < 4:
+         _clean_up_warnings()
+         return [1000,[],]
+     id = bytes(my_midi[0:4])
+     if id != b'MThd':
+         _warn("midi2opus: midi starts with "+str(id)+" instead of 'MThd'")
+         _clean_up_warnings()
+         return [1000,[],]
+     [length, format, tracks_expected, ticks] = struct.unpack(
+         '>IHHH', bytes(my_midi[4:14]))
+     if length != 6:
+         _warn("midi2opus: midi header length was "+str(length)+" instead of 6")
+         _clean_up_warnings()
+         return [1000,[],]
+     my_opus = [ticks,]
+     my_midi = my_midi[14:]
+     track_num = 1  # 5.1
+     while len(my_midi) >= 8:
+         track_type = bytes(my_midi[0:4])
+         if track_type != b'MTrk':
+             _warn('midi2opus: Warning: track #'+str(track_num)+' type is '+str(track_type)+" instead of b'MTrk'")
+         [track_length] = struct.unpack('>I', my_midi[4:8])
+         my_midi = my_midi[8:]
+         if track_length > len(my_midi):
+             _warn('midi2opus: track #'+str(track_num)+' length '+str(track_length)+' is too large')
+             _clean_up_warnings()
+             return my_opus  # 5.0
+         my_midi_track = my_midi[0:track_length]
+         my_track = _decode(my_midi_track)
+         my_opus.append(my_track)
+         my_midi = my_midi[track_length:]
+         track_num += 1  # 5.1
+     _clean_up_warnings()
+     return my_opus
+
+ def opus2score(opus=[]):
+     r'''For a description of the "opus" and "score" formats,
+     see opus2midi() and score2opus().
+     '''
+     if len(opus) < 2:
+         _clean_up_warnings()
+         return [1000,[],]
+     tracks = copy.deepcopy(opus)  # couple of slices probably quicker...
+     ticks = int(tracks.pop(0))
+     score = [ticks,]
+     for opus_track in tracks:
+         ticks_so_far = 0
+         score_track = []
+         chapitch2note_on_events = dict([])  # 4.0
+         for opus_event in opus_track:
+             ticks_so_far += opus_event[1]
+             if opus_event[0] == 'note_off' or (opus_event[0] == 'note_on' and opus_event[4] == 0):  # 4.8
+                 cha = opus_event[2]
+                 pitch = opus_event[3]
+                 key = cha*128 + pitch
+                 if chapitch2note_on_events.get(key):
+                     new_event = chapitch2note_on_events[key].pop(0)
+                     new_event[2] = ticks_so_far - new_event[1]
+                     score_track.append(new_event)
+                 elif pitch > 127:
+                     pass  #_warn('opus2score: note_off with no note_on, bad pitch='+str(pitch))
+                 else:
+                     pass  #_warn('opus2score: note_off with no note_on cha='+str(cha)+' pitch='+str(pitch))
+             elif opus_event[0] == 'note_on':
+                 cha = opus_event[2]
+                 pitch = opus_event[3]
+                 key = cha*128 + pitch
+                 new_event = ['note',ticks_so_far,0,cha,pitch, opus_event[4]]
+                 if chapitch2note_on_events.get(key):
+                     chapitch2note_on_events[key].append(new_event)
+                 else:
+                     chapitch2note_on_events[key] = [new_event,]
+             else:
+                 opus_event[1] = ticks_so_far
+                 score_track.append(opus_event)
+         # check for unterminated notes (Oisín) -- 5.2
+         for chapitch in chapitch2note_on_events:
+             note_on_events = chapitch2note_on_events[chapitch]
+             for new_e in note_on_events:
+                 new_e[2] = ticks_so_far - new_e[1]
+                 score_track.append(new_e)
+                 pass  #_warn("opus2score: note_on with no note_off cha="+str(new_e[3])+' pitch='+str(new_e[4])+'; adding note_off at end')
+         score.append(score_track)
+     _clean_up_warnings()
+     return score
+
+ def midi2score(midi=b''):
+     r'''
+     Translates MIDI into a "score", using midi2opus() then opus2score()
+     '''
+     return opus2score(midi2opus(midi))
+
+ def midi2ms_score(midi=b''):
+     r'''
+     Translates MIDI into a "score" with one beat per second and one
+     tick per millisecond, using midi2opus() then to_millisecs()
+     then opus2score()
+     '''
+     return opus2score(to_millisecs(midi2opus(midi)))
+
+ #------------------------ Other Transformations ---------------------
+
+ def to_millisecs(old_opus=None):
+     r'''Recalibrates all the times in an "opus" to use one beat
+     per second and one tick per millisecond. This makes it
+     hard to retrieve any information about beats or barlines,
+     but it does make it easy to mix different scores together.
+     '''
+     if old_opus == None:
+         return [1000,[],]
+     try:
+         old_tpq = int(old_opus[0])
+     except IndexError:  # 5.0
+         _warn('to_millisecs: the opus '+str(type(old_opus))+' has no elements')
+         return [1000,[],]
+     new_opus = [1000,]
+     # 6.7 first go through building a table of set_tempos by absolute-tick
+     ticks2tempo = {}
+     itrack = 1
+     while itrack < len(old_opus):
+         ticks_so_far = 0
+         for old_event in old_opus[itrack]:
+             if old_event[0] == 'note':
+                 raise TypeError('to_millisecs needs an opus, not a score')
+             ticks_so_far += old_event[1]
+             if old_event[0] == 'set_tempo':
+                 ticks2tempo[ticks_so_far] = old_event[2]
+         itrack += 1
+     # then get the sorted-array of their keys
+     tempo_ticks = []  # list of keys
+     for k in ticks2tempo.keys():
+         tempo_ticks.append(k)
+     tempo_ticks.sort()
+     # then go through converting to millisec, testing if the next
+     # set_tempo lies before the next track-event, and using it if so.
+     itrack = 1
+     while itrack < len(old_opus):
+         ms_per_old_tick = 500.0 / old_tpq  # float: will round later 6.3
+         i_tempo_ticks = 0
+         ticks_so_far = 0
+         ms_so_far = 0.0
+         previous_ms_so_far = 0.0
+         new_track = [['set_tempo',0,1000000],]  # new "crochet" is 1 sec
+         for old_event in old_opus[itrack]:
+             # detect if ticks2tempo has something before this event
+             # 20160702 if ticks2tempo is at the same time, leave it
+             event_delta_ticks = old_event[1]
+             if (i_tempo_ticks < len(tempo_ticks) and
+                     tempo_ticks[i_tempo_ticks] < (ticks_so_far + old_event[1])):
+                 delta_ticks = tempo_ticks[i_tempo_ticks] - ticks_so_far
+                 ms_so_far += (ms_per_old_tick * delta_ticks)
+                 ticks_so_far = tempo_ticks[i_tempo_ticks]
+                 ms_per_old_tick = ticks2tempo[ticks_so_far] / (1000.0*old_tpq)
+                 i_tempo_ticks += 1
+                 event_delta_ticks -= delta_ticks
+             new_event = copy.deepcopy(old_event)  # now handle the new event
+             ms_so_far += (ms_per_old_tick * old_event[1])
+             new_event[1] = round(ms_so_far - previous_ms_so_far)
+             if old_event[0] != 'set_tempo':
+                 previous_ms_so_far = ms_so_far
+                 new_track.append(new_event)
+             ticks_so_far += event_delta_ticks
+         new_opus.append(new_track)
+         itrack += 1
+     _clean_up_warnings()
+     return new_opus
+
+ def event2alsaseq(event=None):  # 5.5
+     r'''Converts an event into the format needed by the alsaseq module,
+     http://pp.com.mx/python/alsaseq
+     The type of track (opus or score) is autodetected.
+     '''
+     pass
+
+ def grep(score=None, channels=None):
+     r'''Returns a "score" containing only the channels specified
+     '''
+     if score == None:
+         return [1000,[],]
+     ticks = score[0]
+     new_score = [ticks,]
+     if channels == None:
+         return new_score
+     channels = set(channels)
+     global Event2channelindex
+     itrack = 1
+     while itrack < len(score):
+         new_score.append([])
+         for event in score[itrack]:
+             channel_index = Event2channelindex.get(event[0], False)
+             if channel_index:
+                 if event[channel_index] in channels:
+                     new_score[itrack].append(event)
+             else:
+                 new_score[itrack].append(event)
+         itrack += 1
+     return new_score
+
+ def play_score(score=None):
+     r'''Converts the "score" to midi, and feeds it into 'aplaymidi -'
+     '''
+     if score == None:
+         return
+     import subprocess
+     pipe = subprocess.Popen(['aplaymidi','-'], stdin=subprocess.PIPE)
+     if score_type(score) == 'opus':
+         pipe.stdin.write(opus2midi(score))
+     else:
+         pipe.stdin.write(score2midi(score))
+     pipe.stdin.close()
+
+ def timeshift(score=None, shift=None, start_time=None, from_time=0, tracks={0,1,2,3,4,5,6,7,8,10,12,13,14,15}):
+     r'''Returns a "score" shifted in time by "shift" ticks, or shifted
+     so that the first event starts at "start_time" ticks.
+
+     If "from_time" is specified, only those events in the score
+     that begin after it are shifted. If "start_time" is less than
+     "from_time" (or "shift" is negative), then the intermediate
+     notes are deleted, though patch-change events are preserved.
+
+     If "tracks" are specified, then only those tracks get shifted.
+     "tracks" can be a list, tuple or set; it gets converted to set
+     internally.
+
+     It is deprecated to specify both "shift" and "start_time".
+     If this does happen, timeshift() will print a warning to
+     stderr and ignore the "shift" argument.
+
+     If "shift" is negative and sufficiently large that it would
+     leave some event with a negative tick-value, then the score
+     is shifted so that the first event occurs at time 0. This
+     also occurs if "start_time" is negative, and is also the
+     default if neither "shift" nor "start_time" are specified.
+     '''
+     #_warn('tracks='+str(tracks))
+     if score == None or len(score) < 2:
+         return [1000, [],]
+     new_score = [score[0],]
+     my_type = score_type(score)
+     if my_type == '':
+         return new_score
+     if my_type == 'opus':
+         _warn("timeshift: opus format is not supported\n")
+         # _clean_up_scores()  6.2; doesn't exist! what was it supposed to do?
+         return new_score
+     if not (shift == None) and not (start_time == None):
+         _warn("timeshift: shift and start_time specified: ignoring shift\n")
+         shift = None
+     if shift == None:
+         if (start_time == None) or (start_time < 0):
+             start_time = 0
+         # shift = start_time - from_time
+
+     i = 1  # ignore first element (ticks)
+     tracks = set(tracks)  # defend against tuples and lists
+     earliest = 1000000000
+     if not (start_time == None) or shift < 0:  # first find the earliest event
+         while i < len(score):
+             if len(tracks) and not ((i-1) in tracks):
+                 i += 1
+                 continue
+             for event in score[i]:
+                 if event[1] < from_time:
+                     continue  # just inspect the to_be_shifted events
+                 if event[1] < earliest:
+                     earliest = event[1]
+             i += 1
+     if earliest > 999999999:
+         earliest = 0
+     if shift == None:
+         shift = start_time - earliest
+     elif (earliest + shift) < 0:
+         start_time = 0
+         shift = 0 - earliest
+
+     i = 1  # ignore first element (ticks)
+     while i < len(score):
+         if len(tracks) == 0 or not ((i-1) in tracks):  # 3.8
+             new_score.append(score[i])
+             i += 1
+             continue
+         new_track = []
+         for event in score[i]:
+             new_event = list(event)
+             #if new_event[1] == 0 and shift > 0 and new_event[0] != 'note':
+             #    pass
+             #elif new_event[1] >= from_time:
+             if new_event[1] >= from_time:
+                 # 4.1 must not rightshift set_tempo
+                 if new_event[0] != 'set_tempo' or shift<0:
+                     new_event[1] += shift
+             elif (shift < 0) and (new_event[1] >= (from_time+shift)):
+                 continue
+             new_track.append(new_event)
+         if len(new_track) > 0:
+             new_score.append(new_track)
+         i += 1
+     _clean_up_warnings()
+     return new_score
+
+ def segment(score=None, start_time=None, end_time=None, start=0, end=100000000,
+             tracks={0,1,2,3,4,5,6,7,8,10,11,12,13,14,15}):
+     r'''Returns a "score" which is a segment of the one supplied
+     as the argument, beginning at "start_time" ticks and ending
+     at "end_time" ticks (or at the end if "end_time" is not supplied).
+     If the set "tracks" is specified, only those tracks will
+     be returned.
+     '''
+     if score == None or len(score) < 2:
+         return [1000, [],]
+     if start_time == None:  # as of 4.2 start_time is recommended
+         start_time = start  # start is legacy usage
+     if end_time == None:    # likewise
+         end_time = end
+     new_score = [score[0],]
+     my_type = score_type(score)
+     if my_type == '':
+         return new_score
+     if my_type == 'opus':
+         # more difficult (disconnecting note_on's from their note_off's)...
+         _warn("segment: opus format is not supported\n")
+         _clean_up_warnings()
+         return new_score
+     i = 1  # ignore first element (ticks); we count in ticks anyway
+     tracks = set(tracks)  # defend against tuples and lists
+     while i < len(score):
+         if len(tracks) and not ((i-1) in tracks):
+             i += 1
+             continue
+         new_track = []
+         channel2cc_num = {}   # most recent controller change before start
+         channel2cc_val = {}
+         channel2cc_time = {}
+         channel2patch_num = {}   # keep most recent patch change before start
+         channel2patch_time = {}
+         set_tempo_num = 500000   # most recent tempo change before start 6.3
+         set_tempo_time = 0
+         earliest_note_time = end_time
+         for event in score[i]:
+             if event[0] == 'control_change':  # 6.5
+                 cc_time = channel2cc_time.get(event[2]) or 0
+                 if (event[1] <= start_time) and (event[1] >= cc_time):
+                     channel2cc_num[event[2]] = event[3]
+                     channel2cc_val[event[2]] = event[4]
+                     channel2cc_time[event[2]] = event[1]
+             elif event[0] == 'patch_change':
+                 patch_time = channel2patch_time.get(event[2]) or 0
+                 if (event[1]<=start_time) and (event[1] >= patch_time):  # 2.0
+                     channel2patch_num[event[2]] = event[3]
+                     channel2patch_time[event[2]] = event[1]
+             elif event[0] == 'set_tempo':
+                 if (event[1]<=start_time) and (event[1]>=set_tempo_time):  # 6.4
+                     set_tempo_num = event[2]
+                     set_tempo_time = event[1]
+             if (event[1] >= start_time) and (event[1] <= end_time):
+                 new_track.append(event)
+                 if (event[0] == 'note') and (event[1] < earliest_note_time):
+                     earliest_note_time = event[1]
+         if len(new_track) > 0:
+             new_track.append(['set_tempo', start_time, set_tempo_num])
+             for c in channel2patch_num:
+                 new_track.append(['patch_change',start_time,c,channel2patch_num[c]],)
+             for c in channel2cc_num:  # 6.5
+                 new_track.append(['control_change',start_time,c,channel2cc_num[c],channel2cc_val[c]])
+             new_score.append(new_track)
+         i += 1
+     _clean_up_warnings()
+     return new_score
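# Illustrative combination (not part of the uploaded file): excerpt the
# passage between ticks 1000 and 2000 of a score, then re-anchor it so it
# starts at tick 0; the tick values here are arbitrary.
#   excerpt = segment(score, start_time=1000, end_time=2000)
#   excerpt = timeshift(excerpt, start_time=0)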
+
+ def score_type(opus_or_score=None):
+     r'''Returns a string, either 'opus' or 'score' or ''
+     '''
+     if opus_or_score == None or str(type(opus_or_score)).find('list')<0 or len(opus_or_score) < 2:
+         return ''
+     i = 1  # ignore first element
+     while i < len(opus_or_score):
+         for event in opus_or_score[i]:
+             if event[0] == 'note':
+                 return 'score'
+             elif event[0] == 'note_on':
+                 return 'opus'
+         i += 1
+     return ''
+
+ def concatenate_scores(scores):
+     r'''Concatenates a list of scores into one score.
+     If the scores differ in their "ticks" parameter,
+     they will all get converted to millisecond-tick format.
+     '''
+     # the deepcopys are needed if the input_score's are refs to the same obj
+     # e.g. if invoked by midisox's repeat()
+     input_scores = _consistentise_ticks(scores)  # 3.7
+     output_score = copy.deepcopy(input_scores[0])
+     for input_score in input_scores[1:]:
+         output_stats = score2stats(output_score)
+         delta_ticks = output_stats['nticks']
+         itrack = 1
+         while itrack < len(input_score):
+             if itrack >= len(output_score):  # new output track if doesn't exist
+                 output_score.append([])
+             for event in input_score[itrack]:
+                 output_score[itrack].append(copy.deepcopy(event))
+                 output_score[itrack][-1][1] += delta_ticks
+             itrack += 1
+     return output_score
+
+ def merge_scores(scores):
+     r'''Merges a list of scores into one score. A merged score comprises
+     all of the tracks from all of the input scores; un-merging is possible
+     by selecting just some of the tracks. If the scores differ in their
+     "ticks" parameter, they will all get converted to millisecond-tick
+     format. merge_scores attempts to resolve channel-conflicts,
+     but there are of course only 15 available channels...
+     '''
+     input_scores = _consistentise_ticks(scores)  # 3.6
+     output_score = [1000]
+     channels_so_far = set()
+     all_channels = {0,1,2,3,4,5,6,7,8,10,11,12,13,14,15}
+     global Event2channelindex
+     for input_score in input_scores:
+         new_channels = set(score2stats(input_score).get('channels_total', []))
+         new_channels.discard(9)  # 2.8 cha9 must remain cha9 (in GM)
+         for channel in channels_so_far & new_channels:
+             # consistently choose lowest available, to ease testing
+             free_channels = list(all_channels - (channels_so_far|new_channels))
+             if len(free_channels) > 0:
+                 free_channels.sort()
+                 free_channel = free_channels[0]
+             else:
+                 free_channel = None
+                 break
+             itrack = 1
+             while itrack < len(input_score):
+                 for input_event in input_score[itrack]:
+                     channel_index=Event2channelindex.get(input_event[0],False)
+                     if channel_index and input_event[channel_index]==channel:
+                         input_event[channel_index] = free_channel
+                 itrack += 1
+             channels_so_far.add(free_channel)
+
+         channels_so_far |= new_channels
+         output_score.extend(input_score[1:])
+     return output_score
+
+ def _ticks(event):
+     return event[1]
+ def mix_opus_tracks(input_tracks):  # 5.5
+     r'''Mixes an array of tracks into one track. A mixed track
+     cannot be un-mixed. It is assumed that the tracks share the same
+     ticks parameter and the same tempo.
+     Mixing score-tracks is trivial (just insert all events into one array).
+     Mixing opus-tracks is only slightly harder, but it's common enough
+     that a dedicated function is useful.
+     '''
+     output_score = [1000, []]
+     for input_track in input_tracks:  # 5.8
+         input_score = opus2score([1000, input_track])
+         for event in input_score[1]:
+             output_score[1].append(event)
+     output_score[1].sort(key=_ticks)
+     output_opus = score2opus(output_score)
+     return output_opus[1]
+
+ def mix_scores(scores):
+     r'''Mixes a list of scores into one one-track score.
+     A mixed score cannot be un-mixed. Hopefully the scores
+     have no undesirable channel-conflicts between them.
+     If the scores differ in their "ticks" parameter,
+     they will all get converted to millisecond-tick format.
+     '''
+     input_scores = _consistentise_ticks(scores)  # 3.6
+     output_score = [1000, []]
+     for input_score in input_scores:
+         for input_track in input_score[1:]:
+             output_score[1].extend(input_track)
+     return output_score
+
+ def score2stats(opus_or_score=None):
+     r'''Returns a dict of some basic stats about the score, like
+     bank_select (list of tuples (msb,lsb)),
+     channels_by_track (list of lists), channels_total (set),
+     general_midi_mode (list),
+     ntracks, nticks, patch_changes_by_track (list of dicts),
+     num_notes_by_channel (list of numbers),
+     patch_changes_total (set),
+     percussion (dict histogram of channel 9 events),
+     pitches (dict histogram of pitches on channels other than 9),
+     pitch_range_by_track (list, by track, of two-member-tuples),
+     pitch_range_sum (sum over tracks of the pitch_ranges),
+     '''
+     bank_select_msb = -1
+     bank_select_lsb = -1
+     bank_select = []
+     channels_by_track = []
+     channels_total = set([])
+     general_midi_mode = []
+     num_notes_by_channel = dict([])
+     patches_used_by_track = []
+     patches_used_total = set([])
+     patch_changes_by_track = []
+     patch_changes_total = set([])
+     percussion = dict([])  # histogram of channel 9 "pitches"
+     pitches = dict([])  # histogram of pitch-occurrences channels 0-8,10-15
+     pitch_range_sum = 0  # sum over tracks of the pitch-ranges
+     pitch_range_by_track = []
+     is_a_score = True
+     if opus_or_score == None:
+         return {'bank_select':[], 'channels_by_track':[], 'channels_total':[],
+             'general_midi_mode':[], 'ntracks':0, 'nticks':0,
+             'num_notes_by_channel':dict([]),
+             'patch_changes_by_track':[], 'patch_changes_total':[],
+             'percussion':{}, 'pitches':{}, 'pitch_range_by_track':[],
+             'ticks_per_quarter':0, 'pitch_range_sum':0}
+     ticks_per_quarter = opus_or_score[0]
+     i = 1  # ignore first element, which is ticks
+     nticks = 0
+     while i < len(opus_or_score):
+         highest_pitch = 0
+         lowest_pitch = 128
+         channels_this_track = set([])
+         patch_changes_this_track = dict({})
+         for event in opus_or_score[i]:
+             if event[0] == 'note':
+                 num_notes_by_channel[event[3]] = num_notes_by_channel.get(event[3],0) + 1
+                 if event[3] == 9:
+                     percussion[event[4]] = percussion.get(event[4],0) + 1
+                 else:
+                     pitches[event[4]] = pitches.get(event[4],0) + 1
+                     if event[4] > highest_pitch:
+                         highest_pitch = event[4]
+                     if event[4] < lowest_pitch:
+                         lowest_pitch = event[4]
+                 channels_this_track.add(event[3])
+                 channels_total.add(event[3])
+                 finish_time = event[1] + event[2]
+                 if finish_time > nticks:
+                     nticks = finish_time
+             elif event[0] == 'note_off' or (event[0] == 'note_on' and event[4] == 0):  # 4.8
+                 finish_time = event[1]
+                 if finish_time > nticks:
+                     nticks = finish_time
+             elif event[0] == 'note_on':
+                 is_a_score = False
+                 num_notes_by_channel[event[2]] = num_notes_by_channel.get(event[2],0) + 1
+                 if event[2] == 9:
+                     percussion[event[3]] = percussion.get(event[3],0) + 1
+                 else:
+                     pitches[event[3]] = pitches.get(event[3],0) + 1
+                     if event[3] > highest_pitch:
+                         highest_pitch = event[3]
+                     if event[3] < lowest_pitch:
+                         lowest_pitch = event[3]
+                 channels_this_track.add(event[2])
+                 channels_total.add(event[2])
+             elif event[0] == 'patch_change':
+                 patch_changes_this_track[event[2]] = event[3]
+                 patch_changes_total.add(event[3])
+             elif event[0] == 'control_change':
+                 if event[3] == 0:  # bank select MSB
+                     bank_select_msb = event[4]
+                 elif event[3] == 32:  # bank select LSB
+                     bank_select_lsb = event[4]
+                 if bank_select_msb >= 0 and bank_select_lsb >= 0:
+                     bank_select.append((bank_select_msb,bank_select_lsb))
+                     bank_select_msb = -1
+                     bank_select_lsb = -1
+             elif event[0] == 'sysex_f0':
+                 if _sysex2midimode.get(event[2], -1) >= 0:
+                     general_midi_mode.append(_sysex2midimode.get(event[2]))
+             if is_a_score:
+                 if event[1] > nticks:
+                     nticks = event[1]
+             else:
+                 nticks += event[1]
+         if lowest_pitch == 128:
+             lowest_pitch = 0
+         channels_by_track.append(channels_this_track)
+         patch_changes_by_track.append(patch_changes_this_track)
+         pitch_range_by_track.append((lowest_pitch,highest_pitch))
+         pitch_range_sum += (highest_pitch-lowest_pitch)
+         i += 1
+
+     return {'bank_select':bank_select,
+         'channels_by_track':channels_by_track,
+         'channels_total':channels_total,
+         'general_midi_mode':general_midi_mode,
+         'ntracks':len(opus_or_score)-1,
+         'nticks':nticks,
+         'num_notes_by_channel':num_notes_by_channel,
+         'patch_changes_by_track':patch_changes_by_track,
+         'patch_changes_total':patch_changes_total,
+         'percussion':percussion,
+         'pitches':pitches,
+         'pitch_range_by_track':pitch_range_by_track,
+         'pitch_range_sum':pitch_range_sum,
+         'ticks_per_quarter':ticks_per_quarter}
900
+
901
+ #----------------------------- Event stuff --------------------------
902
+
903
+ _sysex2midimode = {
904
+ "\x7E\x7F\x09\x01\xF7": 1,
905
+ "\x7E\x7F\x09\x02\xF7": 0,
906
+ "\x7E\x7F\x09\x03\xF7": 2,
907
+ }
908
+
909
+ # Some public-access tuples:
910
+ MIDI_events = tuple('''note_off note_on key_after_touch
911
+ control_change patch_change channel_after_touch
912
+ pitch_wheel_change'''.split())
913
+
914
+ Text_events = tuple('''text_event copyright_text_event
915
+ track_name instrument_name lyric marker cue_point text_event_08
916
+ text_event_09 text_event_0a text_event_0b text_event_0c
917
+ text_event_0d text_event_0e text_event_0f'''.split())
918
+
919
+ Nontext_meta_events = tuple('''end_track set_tempo
920
+ smpte_offset time_signature key_signature sequencer_specific
921
+ raw_meta_event sysex_f0 sysex_f7 song_position song_select
922
+ tune_request'''.split())
923
+ # unsupported: raw_data
924
+
925
+ # Actually, 'tune_request' is is F-series event, not strictly a meta-event...
926
+ Meta_events = Text_events + Nontext_meta_events
927
+ All_events = MIDI_events + Meta_events
928
+
929
+ # And three dictionaries:
930
+ Number2patch = { # General MIDI patch numbers:
931
+ 0:'Acoustic Grand',
932
+ 1:'Bright Acoustic',
933
+ 2:'Electric Grand',
934
+ 3:'Honky-Tonk',
935
+ 4:'Electric Piano 1',
936
+ 5:'Electric Piano 2',
937
+ 6:'Harpsichord',
938
+ 7:'Clav',
939
+ 8:'Celesta',
940
+ 9:'Glockenspiel',
941
+ 10:'Music Box',
942
+ 11:'Vibraphone',
943
+ 12:'Marimba',
944
+ 13:'Xylophone',
945
+ 14:'Tubular Bells',
946
+ 15:'Dulcimer',
947
+ 16:'Drawbar Organ',
948
+ 17:'Percussive Organ',
949
+ 18:'Rock Organ',
950
+ 19:'Church Organ',
951
+ 20:'Reed Organ',
952
+ 21:'Accordion',
953
+ 22:'Harmonica',
954
+ 23:'Tango Accordion',
955
+ 24:'Acoustic Guitar(nylon)',
956
+ 25:'Acoustic Guitar(steel)',
957
+ 26:'Electric Guitar(jazz)',
958
+ 27:'Electric Guitar(clean)',
959
+ 28:'Electric Guitar(muted)',
960
+ 29:'Overdriven Guitar',
961
+ 30:'Distortion Guitar',
962
+ 31:'Guitar Harmonics',
963
+ 32:'Acoustic Bass',
964
+ 33:'Electric Bass(finger)',
965
+ 34:'Electric Bass(pick)',
966
+ 35:'Fretless Bass',
967
+ 36:'Slap Bass 1',
968
+ 37:'Slap Bass 2',
969
+ 38:'Synth Bass 1',
970
+ 39:'Synth Bass 2',
971
+ 40:'Violin',
972
+ 41:'Viola',
973
+ 42:'Cello',
974
+ 43:'Contrabass',
975
+ 44:'Tremolo Strings',
976
+ 45:'Pizzicato Strings',
977
+ 46:'Orchestral Harp',
978
+ 47:'Timpani',
979
+ 48:'String Ensemble 1',
980
+ 49:'String Ensemble 2',
981
+ 50:'SynthStrings 1',
982
+ 51:'SynthStrings 2',
983
+ 52:'Choir Aahs',
984
+ 53:'Voice Oohs',
985
+ 54:'Synth Voice',
986
+ 55:'Orchestra Hit',
987
+ 56:'Trumpet',
988
+ 57:'Trombone',
989
+ 58:'Tuba',
990
+ 59:'Muted Trumpet',
991
+ 60:'French Horn',
992
+ 61:'Brass Section',
993
+ 62:'SynthBrass 1',
994
+ 63:'SynthBrass 2',
995
+ 64:'Soprano Sax',
996
+ 65:'Alto Sax',
997
+ 66:'Tenor Sax',
998
+ 67:'Baritone Sax',
999
+ 68:'Oboe',
1000
+ 69:'English Horn',
1001
+ 70:'Bassoon',
1002
+ 71:'Clarinet',
1003
+ 72:'Piccolo',
1004
+ 73:'Flute',
1005
+ 74:'Recorder',
1006
+ 75:'Pan Flute',
1007
+ 76:'Blown Bottle',
1008
+ 77:'Skakuhachi',
1009
+ 78:'Whistle',
1010
+ 79:'Ocarina',
1011
+ 80:'Lead 1 (square)',
1012
+ 81:'Lead 2 (sawtooth)',
1013
+ 82:'Lead 3 (calliope)',
1014
+ 83:'Lead 4 (chiff)',
1015
+ 84:'Lead 5 (charang)',
1016
+ 85:'Lead 6 (voice)',
1017
+ 86:'Lead 7 (fifths)',
1018
+ 87:'Lead 8 (bass+lead)',
1019
+ 88:'Pad 1 (new age)',
1020
+ 89:'Pad 2 (warm)',
1021
+ 90:'Pad 3 (polysynth)',
1022
+ 91:'Pad 4 (choir)',
1023
+ 92:'Pad 5 (bowed)',
1024
+ 93:'Pad 6 (metallic)',
1025
+ 94:'Pad 7 (halo)',
1026
+ 95:'Pad 8 (sweep)',
1027
+ 96:'FX 1 (rain)',
1028
+ 97:'FX 2 (soundtrack)',
1029
+ 98:'FX 3 (crystal)',
1030
+ 99:'FX 4 (atmosphere)',
1031
+ 100:'FX 5 (brightness)',
1032
+ 101:'FX 6 (goblins)',
1033
+ 102:'FX 7 (echoes)',
1034
+ 103:'FX 8 (sci-fi)',
1035
+ 104:'Sitar',
1036
+ 105:'Banjo',
1037
+ 106:'Shamisen',
1038
+ 107:'Koto',
1039
+ 108:'Kalimba',
1040
+ 109:'Bagpipe',
1041
+ 110:'Fiddle',
1042
+ 111:'Shanai',
1043
+ 112:'Tinkle Bell',
1044
+ 113:'Agogo',
1045
+ 114:'Steel Drums',
1046
+ 115:'Woodblock',
1047
+ 116:'Taiko Drum',
1048
+ 117:'Melodic Tom',
1049
+ 118:'Synth Drum',
1050
+ 119:'Reverse Cymbal',
1051
+ 120:'Guitar Fret Noise',
1052
+ 121:'Breath Noise',
1053
+ 122:'Seashore',
1054
+ 123:'Bird Tweet',
1055
+ 124:'Telephone Ring',
1056
+ 125:'Helicopter',
1057
+ 126:'Applause',
1058
+ 127:'Gunshot',
1059
+ }
1060
+ Notenum2percussion = { # General MIDI Percussion (on Channel 9):
1061
+ 35:'Acoustic Bass Drum',
1062
+ 36:'Bass Drum 1',
1063
+ 37:'Side Stick',
1064
+ 38:'Acoustic Snare',
1065
+ 39:'Hand Clap',
1066
+ 40:'Electric Snare',
1067
+ 41:'Low Floor Tom',
1068
+ 42:'Closed Hi-Hat',
1069
+ 43:'High Floor Tom',
1070
+ 44:'Pedal Hi-Hat',
1071
+ 45:'Low Tom',
1072
+ 46:'Open Hi-Hat',
1073
+ 47:'Low-Mid Tom',
1074
+ 48:'Hi-Mid Tom',
1075
+ 49:'Crash Cymbal 1',
1076
+ 50:'High Tom',
1077
+ 51:'Ride Cymbal 1',
1078
+ 52:'Chinese Cymbal',
1079
+ 53:'Ride Bell',
1080
+ 54:'Tambourine',
1081
+ 55:'Splash Cymbal',
1082
+ 56:'Cowbell',
1083
+ 57:'Crash Cymbal 2',
1084
+ 58:'Vibraslap',
1085
+ 59:'Ride Cymbal 2',
1086
+ 60:'Hi Bongo',
1087
+ 61:'Low Bongo',
1088
+ 62:'Mute Hi Conga',
1089
+ 63:'Open Hi Conga',
1090
+ 64:'Low Conga',
1091
+ 65:'High Timbale',
1092
+ 66:'Low Timbale',
1093
+ 67:'High Agogo',
1094
+ 68:'Low Agogo',
1095
+ 69:'Cabasa',
1096
+ 70:'Maracas',
1097
+ 71:'Short Whistle',
1098
+ 72:'Long Whistle',
1099
+ 73:'Short Guiro',
1100
+ 74:'Long Guiro',
1101
+ 75:'Claves',
1102
+ 76:'Hi Wood Block',
1103
+ 77:'Low Wood Block',
1104
+ 78:'Mute Cuica',
1105
+ 79:'Open Cuica',
1106
+ 80:'Mute Triangle',
1107
+ 81:'Open Triangle',
1108
+ }
1109
+
1110
+ Event2channelindex = { 'note':3, 'note_off':2, 'note_on':2,
1111
+ 'key_after_touch':2, 'control_change':2, 'patch_change':2,
1112
+ 'channel_after_touch':2, 'pitch_wheel_change':2
1113
+ }
1114
+
1115
+ ################################################################
1116
+ # The code below this line is full of frightening things, all to
1117
+ # do with the actual encoding and decoding of binary MIDI data.
1118
+
1119
+ def _twobytes2int(byte_a):
1120
+ r'''decode a 16 bit quantity from two bytes,'''
1121
+ return (byte_a[1] | (byte_a[0] << 8))
1122
+
1123
+ def _int2twobytes(int_16bit):
1124
+ r'''encode a 16 bit quantity into two bytes,'''
1125
+ return bytes([(int_16bit>>8) & 0xFF, int_16bit & 0xFF])
1126
+
1127
+ def _read_14_bit(byte_a):
1128
+ r'''decode a 14 bit quantity from two bytes,'''
1129
+ return (byte_a[0] | (byte_a[1] << 7))
1130
+
1131
+ def _write_14_bit(int_14bit):
1132
+ r'''encode a 14 bit quantity into two bytes,'''
1133
+ return bytes([int_14bit & 0x7F, (int_14bit>>7) & 0x7F])
1134
+
1135
+ def _ber_compressed_int(integer):
1136
+ r'''BER compressed integer (not an ASN.1 BER, see perlpacktut for
1137
+ details). Its bytes represent an unsigned integer in base 128,
1138
+ most significant digit first, with as few digits as possible.
1139
+ Bit eight (the high bit) is set on each byte except the last.
1140
+ '''
1141
+ ber = bytearray(b'')
1142
+ seven_bits = 0x7F & integer
1143
+ ber.insert(0, seven_bits) # XXX surely should convert to a char ?
1144
+ integer >>= 7
1145
+ while integer > 0:
1146
+ seven_bits = 0x7F & integer
1147
+ ber.insert(0, 0x80|seven_bits) # XXX surely should convert to a char ?
1148
+ integer >>= 7
1149
+ return ber
1150
+
1151
+ def _unshift_ber_int(ba):
1152
+ r'''Given a bytearray, returns a tuple of (the ber-integer at the
1153
+ start, and the remainder of the bytearray).
1154
+ '''
1155
+ if not len(ba): # 6.7
1156
+ _warn('_unshift_ber_int: no integer found')
1157
+ return ((0, b""))
1158
+ byte = ba.pop(0)
1159
+ integer = 0
1160
+ while True:
1161
+ integer += (byte & 0x7F)
1162
+ if not (byte & 0x80):
1163
+ return ((integer, ba))
1164
+ if not len(ba):
1165
+ _warn('_unshift_ber_int: no end-of-integer found')
1166
+ return ((0, ba))
1167
+ byte = ba.pop(0)
1168
+ integer <<= 7
1169
+
1170
+ def _clean_up_warnings(): # 5.4
1171
+ # Call this before returning from any publicly callable function
1172
+ # whenever there's a possibility that a warning might have been printed
1173
+ # by the function, or by any private functions it might have called.
1174
+ if _no_warning:
1175
+ return
1176
+ global _previous_times
1177
+ global _previous_warning
1178
+ if _previous_times > 1:
1179
+ # E:1176, 0: invalid syntax (<string>, line 1176) (syntax-error) ???
1180
+ # print(' previous message repeated '+str(_previous_times)+' times', file=sys.stderr)
1181
+ # 6.7
1182
+ sys.stderr.write(' previous message repeated {0} times\n'.format(_previous_times))
1183
+ elif _previous_times > 0:
1184
+ sys.stderr.write(' previous message repeated\n')
1185
+ _previous_times = 0
1186
+ _previous_warning = ''
1187
+
1188
+ def _warn(s=''):
1189
+ if _no_warning:
1190
+ return
1191
+ global _previous_times
1192
+ global _previous_warning
1193
+ if s == _previous_warning: # 5.4
1194
+ _previous_times = _previous_times + 1
1195
+ else:
1196
+ _clean_up_warnings()
1197
+ sys.stderr.write(str(s)+"\n")
1198
+ _previous_warning = s
1199
+
1200
+ def _some_text_event(which_kind=0x01, text=b'some_text'):
1201
+ if str(type(text)).find("'str'") >= 0: # 6.4 test for back-compatibility
1202
+ data = bytes(text, encoding='ISO-8859-1')
1203
+ else:
1204
+ data = bytes(text)
1205
+ return b'\xFF'+bytes((which_kind,))+_ber_compressed_int(len(data))+data
1206
+
1207
+ def _consistentise_ticks(scores): # 3.6
1208
+ # used by mix_scores, merge_scores, concatenate_scores
1209
+ if len(scores) == 1:
1210
+ return copy.deepcopy(scores)
1211
+ are_consistent = True
1212
+ ticks = scores[0][0]
1213
+ iscore = 1
1214
+ while iscore < len(scores):
1215
+ if scores[iscore][0] != ticks:
1216
+ are_consistent = False
1217
+ break
1218
+ iscore += 1
1219
+ if are_consistent:
1220
+ return copy.deepcopy(scores)
1221
+ new_scores = []
1222
+ iscore = 0
1223
+ while iscore < len(scores):
1224
+ score = scores[iscore]
1225
+ new_scores.append(opus2score(to_millisecs(score2opus(score))))
1226
+ iscore += 1
1227
+ return new_scores
1228
+
1229
+
1230
+ ###########################################################################
1231
+
1232
+ def _decode(trackdata=b'', exclude=None, include=None,
1233
+ event_callback=None, exclusive_event_callback=None, no_eot_magic=False):
1234
+ r'''Decodes MIDI track data into an opus-style list of events.
1235
+ The options:
1236
+ 'exclude' is a list of event types which will be ignored SHOULD BE A SET
1237
+ 'include' (and no exclude), makes exclude a list
1238
+ of all possible events, /minus/ what include specifies
1239
+ 'event_callback' is a coderef
1240
+ 'exclusive_event_callback' is a coderef
1241
+ '''
1242
+ trackdata = bytearray(trackdata)
1243
+ if exclude == None:
1244
+ exclude = []
1245
+ if include == None:
1246
+ include = []
1247
+ if include and not exclude:
1248
+ exclude = All_events
1249
+ include = set(include)
1250
+ exclude = set(exclude)
1251
+
1252
+ # Pointer = 0; not used here; we eat through the bytearray instead.
1253
+ event_code = -1; # used for running status
1254
+ event_count = 0;
1255
+ events = []
1256
+
1257
+ while(len(trackdata)):
1258
+ # loop while there's anything to analyze ...
1259
+ eot = False # When True, the event registrar aborts this loop
1260
+ event_count += 1
1261
+
1262
+ E = []
1263
+ # E for events - we'll feed it to the event registrar at the end.
1264
+
1265
+ # Slice off the delta time code, and analyze it
1266
+ [time, remainder] = _unshift_ber_int(trackdata)
1267
+
1268
+ # Now let's see what we can make of the command
1269
+ first_byte = trackdata.pop(0) & 0xFF
1270
+
1271
+ if (first_byte < 0xF0): # It's a MIDI event
1272
+ if (first_byte & 0x80):
1273
+ event_code = first_byte
1274
+ else:
1275
+ # It wants running status; use last event_code value
1276
+ trackdata.insert(0, first_byte)
1277
+ if (event_code == -1):
1278
+ _warn("Running status not set; Aborting track.")
1279
+ return []
1280
+
1281
+ command = event_code & 0xF0
1282
+ channel = event_code & 0x0F
1283
+
1284
+ if (command == 0xF6): # 0-byte argument
1285
+ pass
1286
+ elif (command == 0xC0 or command == 0xD0): # 1-byte argument
1287
+ parameter = trackdata.pop(0) # could be B
1288
+ else: # 2-byte argument could be BB or 14-bit
1289
+ parameter = (trackdata.pop(0), trackdata.pop(0))
1290
+
1291
+ #################################################################
1292
+ # MIDI events
1293
+
1294
+ if (command == 0x80):
1295
+ if 'note_off' in exclude:
1296
+ continue
1297
+ E = ['note_off', time, channel, parameter[0], parameter[1]]
1298
+ elif (command == 0x90):
1299
+ if 'note_on' in exclude:
1300
+ continue
1301
+ E = ['note_on', time, channel, parameter[0], parameter[1]]
1302
+ elif (command == 0xA0):
1303
+ if 'key_after_touch' in exclude:
1304
+ continue
1305
+ E = ['key_after_touch',time,channel,parameter[0],parameter[1]]
1306
+ elif (command == 0xB0):
1307
+ if 'control_change' in exclude:
1308
+ continue
1309
+ E = ['control_change',time,channel,parameter[0],parameter[1]]
1310
+ elif (command == 0xC0):
1311
+ if 'patch_change' in exclude:
1312
+ continue
1313
+ E = ['patch_change', time, channel, parameter]
1314
+ elif (command == 0xD0):
1315
+ if 'channel_after_touch' in exclude:
1316
+ continue
1317
+ E = ['channel_after_touch', time, channel, parameter]
1318
+ elif (command == 0xE0):
1319
+ if 'pitch_wheel_change' in exclude:
1320
+ continue
1321
+ E = ['pitch_wheel_change', time, channel,
1322
+ _read_14_bit(parameter)-0x2000]
1323
+ else:
1324
+ _warn("Shouldn't get here; command="+hex(command))
1325
+
1326
+ elif (first_byte == 0xFF): # It's a Meta-Event! ##################
1327
+ #[command, length, remainder] =
1328
+ # unpack("xCwa*", substr(trackdata, $Pointer, 6));
1329
+ #Pointer += 6 - len(remainder);
1330
+ # # Move past JUST the length-encoded.
1331
+ command = trackdata.pop(0) & 0xFF
1332
+ [length, trackdata] = _unshift_ber_int(trackdata)
1333
+ if (command == 0x00):
1334
+ if (length == 2):
1335
+ E = ['set_sequence_number',time,_twobytes2int(trackdata)]
1336
+ else:
1337
+ _warn('set_sequence_number: length must be 2, not '+str(length))
1338
+ E = ['set_sequence_number', time, 0]
1339
+
1340
+ elif command >= 0x01 and command <= 0x0f: # Text events
1341
+ # 6.2 take it in bytes; let the user get the right encoding.
1342
+ # text_str = trackdata[0:length].decode('ascii','ignore')
1343
+ # text_str = trackdata[0:length].decode('ISO-8859-1')
1344
+ # 6.4 take it in bytes; let the user get the right encoding.
1345
+ text_data = bytes(trackdata[0:length]) # 6.4
1346
+ # Defined text events
1347
+ if (command == 0x01):
1348
+ E = ['text_event', time, text_data]
1349
+ elif (command == 0x02):
1350
+ E = ['copyright_text_event', time, text_data]
1351
+ elif (command == 0x03):
1352
+ E = ['track_name', time, text_data]
1353
+ elif (command == 0x04):
1354
+ E = ['instrument_name', time, text_data]
1355
+ elif (command == 0x05):
1356
+ E = ['lyric', time, text_data]
1357
+ elif (command == 0x06):
1358
+ E = ['marker', time, text_data]
1359
+ elif (command == 0x07):
1360
+ E = ['cue_point', time, text_data]
1361
+ # Reserved but apparently unassigned text events
1362
+ elif (command == 0x08):
1363
+ E = ['text_event_08', time, text_data]
1364
+ elif (command == 0x09):
1365
+ E = ['text_event_09', time, text_data]
1366
+ elif (command == 0x0a):
1367
+ E = ['text_event_0a', time, text_data]
1368
+ elif (command == 0x0b):
1369
+ E = ['text_event_0b', time, text_data]
1370
+ elif (command == 0x0c):
1371
+ E = ['text_event_0c', time, text_data]
1372
+ elif (command == 0x0d):
1373
+ E = ['text_event_0d', time, text_data]
1374
+ elif (command == 0x0e):
1375
+ E = ['text_event_0e', time, text_data]
1376
+ elif (command == 0x0f):
1377
+ E = ['text_event_0f', time, text_data]
1378
+
1379
+ # Now the sticky events -------------------------------------
1380
+ elif (command == 0x2F):
1381
+ E = ['end_track', time]
1382
+ # The code for handling this, oddly, comes LATER,
1383
+ # in the event registrar.
1384
+ elif (command == 0x51): # DTime, Microseconds/Crochet
1385
+ if length != 3:
1386
+ _warn('set_tempo event, but length='+str(length))
1387
+ E = ['set_tempo', time,
1388
+ struct.unpack(">I", b'\x00'+trackdata[0:3])[0]]
1389
+ elif (command == 0x54):
1390
+ if length != 5: # DTime, HR, MN, SE, FR, FF
1391
+ _warn('smpte_offset event, but length='+str(length))
1392
+ E = ['smpte_offset',time] + list(struct.unpack(">BBBBB",trackdata[0:5]))
1393
+ elif (command == 0x58):
1394
+ if length != 4: # DTime, NN, DD, CC, BB
1395
+ _warn('time_signature event, but length='+str(length))
1396
+ E = ['time_signature', time]+list(trackdata[0:4])
1397
+ elif (command == 0x59):
1398
+ if length != 2: # DTime, SF(signed), MI
1399
+ _warn('key_signature event, but length='+str(length))
1400
+ E = ['key_signature',time] + list(struct.unpack(">bB",trackdata[0:2]))
1401
+ elif (command == 0x7F): # 6.4
1402
+ E = ['sequencer_specific',time, bytes(trackdata[0:length])]
1403
+ else:
1404
+ E = ['raw_meta_event', time, command,
1405
+ bytes(trackdata[0:length])] # 6.0
1406
+ #"[uninterpretable meta-event command of length length]"
1407
+ # DTime, Command, Binary Data
1408
+ # It's uninterpretable; record it as raw_data.
1409
+
1410
+ # Pointer += length; # Now move Pointer
1411
+ trackdata = trackdata[length:]
1412
+
1413
+ ######################################################################
1414
+ elif (first_byte == 0xF0 or first_byte == 0xF7):
1415
+ # Note that sysexes in MIDI /files/ are different from sysexes
1416
+ # in MIDI transmissions!! The vast majority of system exclusive
1417
+ # messages will just use the F0 format. For instance, the
1418
+ # transmitted message F0 43 12 00 07 F7 would be stored in a
1419
+ # MIDI file as F0 05 43 12 00 07 F7. As mentioned above, it is
1420
+ # required to include the F7 at the end so that the reader of the
1421
+ # MIDI file knows that it has read the entire message. (But the F7
1422
+ # is omitted if this is a non-final block in a multiblock sysex;
1423
+ # but the F7 (if there) is counted in the message's declared
1424
+ # length, so we don't have to think about it anyway.)
1425
+ #command = trackdata.pop(0)
1426
+ [length, trackdata] = _unshift_ber_int(trackdata)
1427
+ if first_byte == 0xF0:
1428
+ # 20091008 added ISO-8859-1 to get an 8-bit str
1429
+ # 6.4 return bytes instead
1430
+ E = ['sysex_f0', time, bytes(trackdata[0:length])]
1431
+ else:
1432
+ E = ['sysex_f7', time, bytes(trackdata[0:length])]
1433
+ trackdata = trackdata[length:]
1434
+
1435
+ ######################################################################
1436
+ # Now, the MIDI file spec says:
1437
+ # <track data> = <MTrk event>+
1438
+ # <MTrk event> = <delta-time> <event>
1439
+ # <event> = <MIDI event> | <sysex event> | <meta-event>
1440
+ # I know that, on the wire, <MIDI event> can include note_on,
1441
+ # note_off, and all the other 8x to Ex events, AND Fx events
1442
+ # other than F0, F7, and FF -- namely, <song position msg>,
1443
+ # <song select msg>, and <tune request>.
1444
+ #
1445
+ # Whether these can occur in MIDI files is not clearly specified
1446
+ # by the MIDI file spec. So, I'm going to assume that
1447
+ # they CAN, in practice, occur. I don't know whether it's
1448
+ # proper for you to actually emit these into a MIDI file.
1449
+
1450
+ elif (first_byte == 0xF2): # DTime, Beats
1451
+ # <song position msg> ::= F2 <data pair>
1452
+ E = ['song_position', time, _read_14_bit(trackdata[:2])]
1453
+ trackdata = trackdata[2:]
1454
+
1455
+ elif (first_byte == 0xF3): # <song select msg> ::= F3 <data singlet>
1456
+ # E = ['song_select', time, struct.unpack('>B',trackdata.pop(0))[0]]
1457
+ E = ['song_select', time, trackdata[0]]
1458
+ trackdata = trackdata[1:]
1459
+ # DTime, Thing (what?! song number? whatever ...)
1460
+
1461
+ elif (first_byte == 0xF6): # DTime
1462
+ E = ['tune_request', time]
1463
+ # What would a tune request be doing in a MIDI /file/?
1464
+
1465
+ #########################################################
1466
+ # ADD MORE META-EVENTS HERE. TODO:
1467
+ # f1 -- MTC Quarter Frame Message. One data byte follows
1468
+ # the Status; it's the time code value, from 0 to 127.
1469
+ # f8 -- MIDI clock. no data.
1470
+ # fa -- MIDI start. no data.
1471
+ # fb -- MIDI continue. no data.
1472
+ # fc -- MIDI stop. no data.
1473
+ # fe -- Active sense. no data.
1474
+ # f4 f5 f9 fd -- unallocated
1475
+
1476
+ r'''
1477
+ elif (first_byte > 0xF0) { # Some unknown kinda F-series event ####
1478
+ # Here we only produce a one-byte piece of raw data.
1479
+ # But the encoder for 'raw_data' accepts any length of it.
1480
+ E = [ 'raw_data',
1481
+ time, substr(trackdata,Pointer,1) ]
1482
+ # DTime and the Data (in this case, the one Event-byte)
1483
+ ++Pointer; # itself
1484
+
1485
+ '''
1486
+ elif first_byte > 0xF0: # Some unknown F-series event
1487
+ # Here we only produce a one-byte piece of raw data.
1488
+ # E = ['raw_data', time, bytes(trackdata[0])] # 6.4
1489
+ E = ['raw_data', time, trackdata[0]] # 6.4 6.7
1490
+ trackdata = trackdata[1:]
1491
+ else: # Fallthru.
1492
+ _warn("Aborting track. Command-byte first_byte="+hex(first_byte))
1493
+ break
1494
+ # End of the big if-group
1495
+
1496
+
1497
+ ######################################################################
1498
+ # THE EVENT REGISTRAR...
1499
+ if E and (E[0] == 'end_track'):
1500
+ # This is the code for exceptional handling of the EOT event.
1501
+ eot = True
1502
+ if not no_eot_magic:
1503
+ if E[1] > 0: # a null text-event to carry the delta-time
1504
+ E = ['text_event', E[1], '']
1505
+ else:
1506
+ E = [] # EOT with a delta-time of 0; ignore it.
1507
+
1508
+ if E and not (E[0] in exclude):
1509
+ #if ( $exclusive_event_callback ):
1510
+ # &{ $exclusive_event_callback }( @E );
1511
+ #else:
1512
+ # &{ $event_callback }( @E ) if $event_callback;
1513
+ events.append(E)
1514
+ if eot:
1515
+ break
1516
+
1517
+ # End of the big "Event" while-block
1518
+
1519
+ return events
1520
+
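For orientation, each entry of the returned events list is [name, dtime, ...parameters]. A hypothetical decode of a short one-note track (values illustrative) would look like:

    [['patch_change', 0, 0, 40],      # dtime, channel, patch
     ['note_on', 0, 0, 60, 100],      # dtime, channel, note, velocity
     ['note_off', 96, 0, 60, 100],
     ['text_event', 12, '']]          # end_track with dtime 12, rewritten by the EOT magic above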
1521
+
1522
+ ###########################################################################
1523
+ def _encode(events_lol, unknown_callback=None, never_add_eot=False,
1524
+ no_eot_magic=False, no_running_status=False):
1525
+ # encode an event structure, presumably for writing to a file
1526
+ # Calling format:
1527
+ # $data_r = MIDI::Event::encode( \@event_lol, { options } );
1528
+ # Takes a REFERENCE to an event structure (a LoL)
1529
+ # Returns an (unblessed) REFERENCE to track data.
1530
+
1531
+ # If you want to use this to encode a /single/ event,
1532
+ # you still have to do it as a reference to an event structure (a LoL)
1533
+ # that just happens to have just one event. I.e.,
1534
+ # encode( [ $event ] ) or encode( [ [ 'note_on', 100, 5, 42, 64] ] )
1535
+ # If you're doing this, consider the never_add_eot track option, as in
1536
+ # print MIDI ${ encode( [ $event], { 'never_add_eot' => 1} ) };
1537
+
1538
+ data = [] # what I'll store the chunks of byte-data in
1539
+
1540
+ # This is so my end_track magic won't corrupt the original
1541
+ events = copy.deepcopy(events_lol)
1542
+
1543
+ if not never_add_eot:
1544
+ # One way or another, tack on an 'end_track'
1545
+ if events:
1546
+ last = events[-1]
1547
+ if not (last[0] == 'end_track'): # no end_track already
1548
+ if (last[0] == 'text_event' and len(last[2]) == 0):
1549
+ # 0-length text event at track-end.
1550
+ if no_eot_magic:
1551
+ # Exceptional case: don't mess with track-final
1552
+ # 0-length text_events; just peg on an end_track
1553
+ events.append(['end_track', 0])
1554
+ else:
1555
+ # NORMAL CASE: replace with an end_track, leaving DTime
1556
+ last[0] = 'end_track'
1557
+ else:
1558
+ # last event was neither 0-length text_event nor end_track
1559
+ events.append(['end_track', 0])
1560
+ else: # an eventless track!
1561
+ events = [['end_track', 0],]
1562
+
1563
+ # maybe_running_status = not no_running_status # unused? 4.7
1564
+ last_status = -1
1565
+
1566
+ for event_r in (events):
1567
+ E = copy.deepcopy(event_r)
1568
+ # otherwise the shifting'd corrupt the original
1569
+ if not E:
1570
+ continue
1571
+
1572
+ event = E.pop(0)
1573
+ if not len(event):
1574
+ continue
1575
+
1576
+ dtime = int(E.pop(0))
1577
+ # print('event='+str(event)+' dtime='+str(dtime))
1578
+
1579
+ event_data = ''
1580
+
1581
+ if ( # MIDI events -- eligible for running status
1582
+ event == 'note_on'
1583
+ or event == 'note_off'
1584
+ or event == 'control_change'
1585
+ or event == 'key_after_touch'
1586
+ or event == 'patch_change'
1587
+ or event == 'channel_after_touch'
1588
+ or event == 'pitch_wheel_change' ):
1589
+
1590
+ # This block is where we spend most of the time. Gotta be tight.
1591
+ if (event == 'note_off'):
1592
+ status = 0x80 | (int(E[0]) & 0x0F)
1593
+ parameters = struct.pack('>BB', int(E[1])&0x7F, int(E[2])&0x7F)
1594
+ elif (event == 'note_on'):
1595
+ status = 0x90 | (int(E[0]) & 0x0F)
1596
+ parameters = struct.pack('>BB', int(E[1])&0x7F, int(E[2])&0x7F)
1597
+ elif (event == 'key_after_touch'):
1598
+ status = 0xA0 | (int(E[0]) & 0x0F)
1599
+ parameters = struct.pack('>BB', int(E[1])&0x7F, int(E[2])&0x7F)
1600
+ elif (event == 'control_change'):
1601
+ status = 0xB0 | (int(E[0]) & 0x0F)
1602
+ parameters = struct.pack('>BB', int(E[1])&0xFF, int(E[2])&0xFF)
1603
+ elif (event == 'patch_change'):
1604
+ status = 0xC0 | (int(E[0]) & 0x0F)
1605
+ parameters = struct.pack('>B', int(E[1]) & 0xFF)
1606
+ elif (event == 'channel_after_touch'):
1607
+ status = 0xD0 | (int(E[0]) & 0x0F)
1608
+ parameters = struct.pack('>B', int(E[1]) & 0xFF)
1609
+ elif (event == 'pitch_wheel_change'):
1610
+ status = 0xE0 | (int(E[0]) & 0x0F)
1611
+ parameters = _write_14_bit(int(E[1]) + 0x2000)
1612
+ else:
1613
+ _warn("BADASS FREAKOUT ERROR 31415!")
1614
+
1615
+ # And now the encoding
1616
+ # w = BER compressed integer (not ASN.1 BER, see perlpacktut for
1617
+ # details). Its bytes represent an unsigned integer in base 128,
1618
+ # most significant digit first, with as few digits as possible.
1619
+ # Bit eight (the high bit) is set on each byte except the last.
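+ # (For example, a delta-time of 200 = 1*128 + 72 is encoded as the
+ # two bytes 0x81 0x48.)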
1620
+
1621
+ data.append(_ber_compressed_int(dtime))
1622
+ if (status != last_status) or no_running_status:
1623
+ data.append(struct.pack('>B', status))
1624
+ data.append(parameters)
1625
+
1626
+ last_status = status
1627
+ continue
1628
+ else:
1629
+ # Not a MIDI event.
1630
+ # All the code in this block could be more efficient,
1631
+ # but this is not where the code needs to be tight.
1632
+ # print "zaz $event\n";
1633
+ last_status = -1
1634
+
1635
+ if event == 'raw_meta_event':
1636
+ event_data = _some_text_event(int(E[0]), E[1])
1637
+ elif (event == 'set_sequence_number'): # 3.9
1638
+ event_data = b'\xFF\x00\x02'+_int2twobytes(E[0])
1639
+
1640
+ # Text meta-events...
1641
+ # a case for a dict, I think (pjb) ...
1642
+ elif (event == 'text_event'):
1643
+ event_data = _some_text_event(0x01, E[0])
1644
+ elif (event == 'copyright_text_event'):
1645
+ event_data = _some_text_event(0x02, E[0])
1646
+ elif (event == 'track_name'):
1647
+ event_data = _some_text_event(0x03, E[0])
1648
+ elif (event == 'instrument_name'):
1649
+ event_data = _some_text_event(0x04, E[0])
1650
+ elif (event == 'lyric'):
1651
+ event_data = _some_text_event(0x05, E[0])
1652
+ elif (event == 'marker'):
1653
+ event_data = _some_text_event(0x06, E[0])
1654
+ elif (event == 'cue_point'):
1655
+ event_data = _some_text_event(0x07, E[0])
1656
+ elif (event == 'text_event_08'):
1657
+ event_data = _some_text_event(0x08, E[0])
1658
+ elif (event == 'text_event_09'):
1659
+ event_data = _some_text_event(0x09, E[0])
1660
+ elif (event == 'text_event_0a'):
1661
+ event_data = _some_text_event(0x0A, E[0])
1662
+ elif (event == 'text_event_0b'):
1663
+ event_data = _some_text_event(0x0B, E[0])
1664
+ elif (event == 'text_event_0c'):
1665
+ event_data = _some_text_event(0x0C, E[0])
1666
+ elif (event == 'text_event_0d'):
1667
+ event_data = _some_text_event(0x0D, E[0])
1668
+ elif (event == 'text_event_0e'):
1669
+ event_data = _some_text_event(0x0E, E[0])
1670
+ elif (event == 'text_event_0f'):
1671
+ event_data = _some_text_event(0x0F, E[0])
1672
+ # End of text meta-events
1673
+
1674
+ elif (event == 'end_track'):
1675
+ event_data = b"\xFF\x2F\x00"
1676
+
1677
+ elif (event == 'set_tempo'):
1678
+ #event_data = struct.pack(">BBwa*", 0xFF, 0x51, 3,
1679
+ # substr( struct.pack('>I', E[0]), 1, 3))
1680
+ event_data = b'\xFF\x51\x03'+struct.pack('>I',E[0])[1:]
1681
+ elif (event == 'smpte_offset'):
1682
+ # event_data = struct.pack(">BBwBBBBB", 0xFF, 0x54, 5, E[0:5] )
1683
+ event_data = struct.pack(">BBBbBBBB", 0xFF,0x54,0x05,E[0],E[1],E[2],E[3],E[4])
1684
+ elif (event == 'time_signature'):
1685
+ # event_data = struct.pack(">BBwBBBB", 0xFF, 0x58, 4, E[0:4] )
1686
+ event_data = struct.pack(">BBBbBBB", 0xFF, 0x58, 0x04, E[0],E[1],E[2],E[3])
1687
+ elif (event == 'key_signature'):
1688
+ event_data = struct.pack(">BBBbB", 0xFF, 0x59, 0x02, E[0],E[1])
1689
+ elif (event == 'sequencer_specific'):
1690
+ # event_data = struct.pack(">BBwa*", 0xFF,0x7F, len(E[0]), E[0])
1691
+ event_data = _some_text_event(0x7F, E[0])
1692
+ # End of Meta-events
1693
+
1694
+ # Other Things...
1695
+ elif (event == 'sysex_f0'):
1696
+ #event_data = struct.pack(">Bwa*", 0xF0, len(E[0]), E[0])
1697
+ #B=bitstring w=BER-compressed-integer a=null-padded-ascii-str
1698
+ event_data = bytearray(b'\xF0')+_ber_compressed_int(len(E[0]))+bytearray(E[0])
1699
+ elif (event == 'sysex_f7'):
1700
+ #event_data = struct.pack(">Bwa*", 0xF7, len(E[0]), E[0])
1701
+ event_data = bytearray(b'\xF7')+_ber_compressed_int(len(E[0]))+bytearray(E[0])
1702
+
1703
+ elif (event == 'song_position'):
1704
+ event_data = b"\xF2" + _write_14_bit( E[0] )
1705
+ elif (event == 'song_select'):
1706
+ event_data = struct.pack('>BB', 0xF3, E[0] )
1707
+ elif (event == 'tune_request'):
1708
+ event_data = b"\xF6"
1709
+ elif (event == 'raw_data'):
1710
+ _warn("_encode: raw_data event not supported")
1711
+ # event_data = E[0]
1712
+ continue
1713
+ # End of Other Stuff
1714
+
1715
+ else:
1716
+ # The Big Fallthru
1717
+ if unknown_callback:
1718
+ # push(@data, &{ $unknown_callback }( @$event_r ))
1719
+ pass
1720
+ else:
1721
+ _warn("Unknown event: "+str(event))
1722
+ # To suppress complaint here, just set
1723
+ # 'unknown_callback' => sub { return () }
1724
+ continue
1725
+
1726
+ #print "Event $event encoded part 2\n"
1727
+ if str(type(event_data)).find("'str'") >= 0:
1728
+ event_data = bytearray(event_data.encode('Latin1', 'ignore'))
1729
+ if len(event_data): # how could $event_data be empty
1730
+ # data.append(struct.pack('>wa*', dtime, event_data))
1731
+ # print(' event_data='+str(event_data))
1732
+ data.append(_ber_compressed_int(dtime)+event_data)
1733
+
1734
+ return b''.join(data)
1735
+
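Taken together with _decode, _encode is the inverse direction: it consumes an event LoL and returns the byte body of one MTrk chunk, appending an end_track unless never_add_eot is set. A minimal sketch (event values illustrative; assumes this module is importable as MIDI):

    import MIDI

    track = [['patch_change', 0, 0, 40],    # dtime, channel, patch
             ['note_on', 0, 0, 60, 100],    # dtime, channel, note, velocity
             ['note_off', 96, 0, 60, 100]]
    body = MIDI._encode(track)              # running status is used unless no_running_status=True
    print(body.hex())                       # ends with ff2f00 (the appended end_track)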
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: Modified AI Midi Tool Space IAT 360
3
- emoji: 🐠
4
- colorFrom: gray
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.8.0
8
- app_file: app.py
9
- pinned: false
10
- short_description: 'A modified version of the AI midi composer '
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Midi Music Generator
3
+ emoji: 🎼🎶
4
+ colorFrom: red
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 5.3.0
8
+ app_file: app_onnx.py
9
+ pinned: true
10
+ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,533 @@
1
+ import spaces
2
+ import random
3
+ import argparse
4
+ import glob
5
+ import json
6
+ import os
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor
9
+
10
+ import gradio as gr
11
+ import numpy as np
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import tqdm
15
+ from huggingface_hub import hf_hub_download
16
+ from transformers import DynamicCache
17
+
18
+ import MIDI
19
+ from midi_model import MIDIModel, MIDIModelConfig
20
+ from midi_synthesizer import MidiSynthesizer
21
+
22
+ MAX_SEED = np.iinfo(np.int32).max
23
+ in_space = os.getenv("SYSTEM") == "spaces"
24
+
25
+
26
+ @torch.inference_mode()
27
+ def generate(model: MIDIModel, prompt=None, batch_size=1, max_len=512, temp=1.0, top_p=0.98, top_k=20,
28
+ disable_patch_change=False, disable_control_change=False, disable_channels=None, generator=None):
29
+ tokenizer = model.tokenizer
30
+ if disable_channels is not None:
31
+ disable_channels = [tokenizer.parameter_ids["channel"][c] for c in disable_channels]
32
+ else:
33
+ disable_channels = []
34
+ max_token_seq = tokenizer.max_token_seq
35
+ if prompt is None:
36
+ input_tensor = torch.full((1, max_token_seq), tokenizer.pad_id, dtype=torch.long, device=model.device)
37
+ input_tensor[0, 0] = tokenizer.bos_id # bos
38
+ input_tensor = input_tensor.unsqueeze(0)
39
+ input_tensor = torch.cat([input_tensor] * batch_size, dim=0)
40
+ else:
41
+ if len(prompt.shape) == 2:
42
+ prompt = prompt[None, :]
43
+ prompt = np.repeat(prompt, repeats=batch_size, axis=0)
44
+ elif prompt.shape[0] == 1:
45
+ prompt = np.repeat(prompt, repeats=batch_size, axis=0)
46
+ elif len(prompt.shape) != 3 or prompt.shape[0] != batch_size:
47
+ raise ValueError(f"invalid shape for prompt, {prompt.shape}")
48
+ prompt = prompt[..., :max_token_seq]
49
+ if prompt.shape[-1] < max_token_seq:
50
+ prompt = np.pad(prompt, ((0, 0), (0, 0), (0, max_token_seq - prompt.shape[-1])),
51
+ mode="constant", constant_values=tokenizer.pad_id)
52
+ input_tensor = torch.from_numpy(prompt).to(dtype=torch.long, device=model.device)
53
+ cur_len = input_tensor.shape[1]
54
+ bar = tqdm.tqdm(desc="generating", total=max_len - cur_len, disable=in_space)
55
+ cache1 = DynamicCache()
56
+ past_len = 0
57
+ with bar:
58
+ while cur_len < max_len:
59
+ end = [False] * batch_size
60
+ hidden = model.forward(input_tensor[:, past_len:], cache=cache1)[:, -1]
61
+ next_token_seq = None
62
+ event_names = [""] * batch_size
63
+ cache2 = DynamicCache()
64
+ for i in range(max_token_seq):
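+ # Position 0 of each event picks the event type (or EOS); subsequent
+ # positions are masked to that event's parameter vocabularies.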
65
+ mask = torch.zeros((batch_size, tokenizer.vocab_size), dtype=torch.int64, device=model.device)
66
+ for b in range(batch_size):
67
+ if end[b]:
68
+ mask[b, tokenizer.pad_id] = 1
69
+ continue
70
+ if i == 0:
71
+ mask_ids = list(tokenizer.event_ids.values()) + [tokenizer.eos_id]
72
+ if disable_patch_change:
73
+ mask_ids.remove(tokenizer.event_ids["patch_change"])
74
+ if disable_control_change:
75
+ mask_ids.remove(tokenizer.event_ids["control_change"])
76
+ mask[b, mask_ids] = 1
77
+ else:
78
+ param_names = tokenizer.events[event_names[b]]
79
+ if i > len(param_names):
80
+ mask[b, tokenizer.pad_id] = 1
81
+ continue
82
+ param_name = param_names[i - 1]
83
+ mask_ids = tokenizer.parameter_ids[param_name]
84
+ if param_name == "channel":
85
+ mask_ids = [i for i in mask_ids if i not in disable_channels]
86
+ mask[b, mask_ids] = 1
87
+ mask = mask.unsqueeze(1)
88
+ x = next_token_seq
89
+ if i != 0:
90
+ hidden = None
91
+ x = x[:, -1:]
92
+ logits = model.forward_token(hidden, x, cache=cache2)[:, -1:]
93
+ scores = torch.softmax(logits / temp, dim=-1) * mask
94
+ samples = model.sample_top_p_k(scores, top_p, top_k, generator=generator)
95
+ if i == 0:
96
+ next_token_seq = samples
97
+ for b in range(batch_size):
98
+ if end[b]:
99
+ continue
100
+ eid = samples[b].item()
101
+ if eid == tokenizer.eos_id:
102
+ end[b] = True
103
+ else:
104
+ event_names[b] = tokenizer.id_events[eid]
105
+ else:
106
+ next_token_seq = torch.cat([next_token_seq, samples], dim=1)
107
+ if all([len(tokenizer.events[event_names[b]]) == i for b in range(batch_size) if not end[b]]):
108
+ break
109
+ if next_token_seq.shape[1] < max_token_seq:
110
+ next_token_seq = F.pad(next_token_seq, (0, max_token_seq - next_token_seq.shape[1]),
111
+ "constant", value=tokenizer.pad_id)
112
+ next_token_seq = next_token_seq.unsqueeze(1)
113
+ input_tensor = torch.cat([input_tensor, next_token_seq], dim=1)
114
+ past_len = cur_len
115
+ cur_len += 1
116
+ bar.update(1)
117
+ yield next_token_seq[:, 0].cpu().numpy()
118
+ if all(end):
119
+ break
120
+
121
+
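A minimal sketch of driving generate directly, outside Gradio (assuming the model fits on CPU and that detokenize accepts the bos prompt row plus the generated rows, as in finish_run below):

    model = MIDIModel.from_pretrained("skytnt/midi-model")
    model.to(device="cpu", dtype=torch.float32)
    tk = model.tokenizer
    rows = [[tk.bos_id] + [tk.pad_id] * (tk.max_token_seq - 1)]  # bos prompt row
    for step in generate(model, batch_size=1, max_len=128):
        rows.append(step[0].tolist())                            # one event per yield
    with open("demo.mid", "wb") as f:
        f.write(MIDI.score2midi(tk.detokenize(rows)))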
122
+ def create_msg(name, data):
123
+ return {"name": name, "data": data}
124
+
125
+
126
+ def send_msgs(msgs):
127
+ return json.dumps(msgs)
128
+
129
+
130
+ def get_duration(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm,
131
+ time_sig, key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr,
132
+ remove_empty_channels, seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
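+ # Rough ZeroGPU time budget: throughput is roughly 23 events/s for the
+ # medium models and 14 events/s for the large one, plus 5 s of overhead.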
133
+ t = gen_events // 23
134
+ if "large" in model_name:
135
+ t = gen_events // 14
136
+ return t + 5
137
+
138
+
139
+ @spaces.GPU(duration=get_duration)
140
+ def run(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm, time_sig,
141
+ key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr, remove_empty_channels,
142
+ seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
143
+ model = models[model_name]
144
+ model.to(device=opt.device)
145
+ tokenizer = model.tokenizer
146
+ bpm = int(bpm)
147
+ if time_sig == "auto":
148
+ time_sig = None
149
+ time_sig_nn = 4
150
+ time_sig_dd = 2
151
+ else:
152
+ time_sig_nn, time_sig_dd = time_sig.split('/')
153
+ time_sig_nn = int(time_sig_nn)
154
+ time_sig_dd = {2: 1, 4: 2, 8: 3}[int(time_sig_dd)]
155
+ if key_sig == 0:
156
+ key_sig = None
157
+ key_sig_sf = 0
158
+ key_sig_mi = 0
159
+ else:
160
+ key_sig = (key_sig - 1)
161
+ key_sig_sf = key_sig // 2 - 7
162
+ key_sig_mi = key_sig % 2
163
+ gen_events = int(gen_events)
164
+ max_len = gen_events
165
+ if seed_rand:
166
+ seed = random.randint(0, MAX_SEED)
167
+ generator = torch.Generator(opt.device).manual_seed(seed)
168
+ disable_patch_change = False
169
+ disable_channels = None
170
+ if tab == 0:
171
+ i = 0
172
+ mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
173
+ if tokenizer.version == "v2":
174
+ if time_sig is not None:
175
+ mid.append(tokenizer.event2tokens(["time_signature", 0, 0, 0, time_sig_nn - 1, time_sig_dd - 1]))
176
+ if key_sig is not None:
177
+ mid.append(tokenizer.event2tokens(["key_signature", 0, 0, 0, key_sig_sf + 7, key_sig_mi]))
178
+ if bpm != 0:
179
+ mid.append(tokenizer.event2tokens(["set_tempo", 0, 0, 0, bpm]))
180
+ patches = {}
181
+ if instruments is None:
182
+ instruments = []
183
+ for instr in instruments:
184
+ patches[i] = patch2number[instr]
185
+ i = (i + 1) if i != 8 else 10
186
+ if drum_kit != "None":
187
+ patches[9] = drum_kits2number[drum_kit]
188
+ for i, (c, p) in enumerate(patches.items()):
189
+ mid.append(tokenizer.event2tokens(["patch_change", 0, 0, i + 1, c, p]))
190
+ mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
191
+ mid_seq = mid.tolist()
192
+ if len(instruments) > 0:
193
+ disable_patch_change = True
194
+ disable_channels = [i for i in range(16) if i not in patches]
195
+ elif tab == 1 and mid is not None:
196
+ eps = 4 if reduce_cc_st else 0
197
+ mid = tokenizer.tokenize(MIDI.midi2score(mid), cc_eps=eps, tempo_eps=eps,
198
+ remap_track_channel=remap_track_channel,
199
+ add_default_instr=add_default_instr,
200
+ remove_empty_channels=remove_empty_channels)
201
+ mid = mid[:int(midi_events)]
202
+ mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
203
+ mid_seq = mid.tolist()
204
+ elif tab == 2 and mid_seq is not None:
205
+ mid = np.asarray(mid_seq, dtype=np.int64)
206
+ if continuation_select > 0:
207
+ continuation_state.append(mid_seq)
208
+ mid = np.repeat(mid[continuation_select - 1:continuation_select], repeats=OUTPUT_BATCH_SIZE, axis=0)
209
+ mid_seq = mid.tolist()
210
+ else:
211
+ continuation_state.append(mid.shape[1])
212
+ else:
213
+ continuation_state = [0]
214
+ mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
215
+ mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
216
+ mid_seq = mid.tolist()
217
+
218
+ if mid is not None:
219
+ max_len += mid.shape[1]
220
+
221
+ init_msgs = [create_msg("progress", [0, gen_events])]
222
+ if not (tab == 2 and continuation_select == 0):
223
+ for i in range(OUTPUT_BATCH_SIZE):
224
+ events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
225
+ init_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
226
+ create_msg("visualizer_append", [i, events])]
227
+ yield mid_seq, continuation_state, seed, send_msgs(init_msgs)
228
+ midi_generator = generate(model, mid, batch_size=OUTPUT_BATCH_SIZE, max_len=max_len, temp=temp,
229
+ top_p=top_p, top_k=top_k, disable_patch_change=disable_patch_change,
230
+ disable_control_change=not allow_cc, disable_channels=disable_channels,
231
+ generator=generator)
232
+ events = [list() for i in range(OUTPUT_BATCH_SIZE)]
233
+ t = time.time() + 1
234
+ for i, token_seqs in enumerate(midi_generator):
235
+ token_seqs = token_seqs.tolist()
236
+ for j in range(OUTPUT_BATCH_SIZE):
237
+ token_seq = token_seqs[j]
238
+ mid_seq[j].append(token_seq)
239
+ events[j].append(tokenizer.tokens2event(token_seq))
240
+ if time.time() - t > 0.5:
241
+ msgs = [create_msg("progress", [i + 1, gen_events])]
242
+ for j in range(OUTPUT_BATCH_SIZE):
243
+ msgs += [create_msg("visualizer_append", [j, events[j]])]
244
+ events[j] = list()
245
+ yield mid_seq, continuation_state, seed, send_msgs(msgs)
246
+ t = time.time()
247
+ yield mid_seq, continuation_state, seed, send_msgs([])
248
+
249
+
250
+ def finish_run(model_name, mid_seq):
251
+ if mid_seq is None:
252
+ outputs = [None] * OUTPUT_BATCH_SIZE
253
+ return *outputs, []
254
+ tokenizer = models[model_name].tokenizer
255
+ outputs = []
256
+ end_msgs = [create_msg("progress", [0, 0])]
257
+ if not os.path.exists("outputs"):
258
+ os.mkdir("outputs")
259
+ for i in range(OUTPUT_BATCH_SIZE):
260
+ events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
261
+ mid = tokenizer.detokenize(mid_seq[i])
262
+ with open(f"outputs/output{i + 1}.mid", 'wb') as f:
263
+ f.write(MIDI.score2midi(mid))
264
+ outputs.append(f"outputs/output{i + 1}.mid")
265
+ end_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
266
+ create_msg("visualizer_append", [i, events]),
267
+ create_msg("visualizer_end", i)]
268
+ return *outputs, send_msgs(end_msgs)
269
+
270
+
271
+ def synthesis_task(mid):
272
+ return synthesizer.synthesis(MIDI.score2opus(mid))
273
+
274
+ def render_audio(model_name, mid_seq, should_render_audio):
275
+ if (not should_render_audio) or mid_seq is None:
276
+ outputs = [None] * OUTPUT_BATCH_SIZE
277
+ return tuple(outputs)
278
+ tokenizer = models[model_name].tokenizer
279
+ outputs = []
280
+ if not os.path.exists("outputs"):
281
+ os.mkdir("outputs")
282
+ audio_futures = []
283
+ for i in range(OUTPUT_BATCH_SIZE):
284
+ mid = tokenizer.detokenize(mid_seq[i])
285
+ audio_future = thread_pool.submit(synthesis_task, mid)
286
+ audio_futures.append(audio_future)
287
+ for future in audio_futures:
288
+ outputs.append((44100, future.result()))
289
+ if OUTPUT_BATCH_SIZE == 1:
290
+ return outputs[0]
291
+ return tuple(outputs)
292
+
293
+
294
+ def undo_continuation(model_name, mid_seq, continuation_state):
295
+ if mid_seq is None or len(continuation_state) < 2:
296
+ return mid_seq, continuation_state, send_msgs([])
297
+ tokenizer = models[model_name].tokenizer
298
+ if isinstance(continuation_state[-1], list):
299
+ mid_seq = continuation_state[-1]
300
+ else:
301
+ mid_seq = [ms[:continuation_state[-1]] for ms in mid_seq]
302
+ continuation_state = continuation_state[:-1]
303
+ end_msgs = [create_msg("progress", [0, 0])]
304
+ for i in range(OUTPUT_BATCH_SIZE):
305
+ events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
306
+ end_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
307
+ create_msg("visualizer_append", [i, events]),
308
+ create_msg("visualizer_end", i)]
309
+ return mid_seq, continuation_state, send_msgs(end_msgs)
310
+
311
+
312
+ def load_javascript(dir="javascript"):
313
+ scripts_list = glob.glob(f"{dir}/*.js")
314
+ javascript = ""
315
+ for path in scripts_list:
316
+ with open(path, "r", encoding="utf8") as jsfile:
317
+ js_content = jsfile.read()
318
+ js_content = js_content.replace("const MIDI_OUTPUT_BATCH_SIZE=4;",
319
+ f"const MIDI_OUTPUT_BATCH_SIZE={OUTPUT_BATCH_SIZE};")
320
+ javascript += f"\n<!-- {path} --><script>{js_content}</script>"
321
+ template_response_ori = gr.routes.templates.TemplateResponse
322
+
323
+ def template_response(*args, **kwargs):
324
+ res = template_response_ori(*args, **kwargs)
325
+ res.body = res.body.replace(
326
+ b'</head>', f'{javascript}</head>'.encode("utf8"))
327
+ res.init_headers()
328
+ return res
329
+
330
+ gr.routes.templates.TemplateResponse = template_response
331
+
332
+
333
+ def hf_hub_download_retry(repo_id, filename):
334
+ print(f"downloading {repo_id} {filename}")
335
+ retry = 0
336
+ err = None
337
+ while retry < 30:
338
+ try:
339
+ return hf_hub_download(repo_id=repo_id, filename=filename)
340
+ except Exception as e:
341
+ err = e
342
+ retry += 1
343
+ if err:
344
+ raise err
345
+
346
+
347
+ number2drum_kits = {-1: "None", 0: "Standard", 8: "Room", 16: "Power", 24: "Electric", 25: "TR-808", 32: "Jazz",
348
+ 40: "Brush", 48: "Orchestra"}
349
+ patch2number = {v: k for k, v in MIDI.Number2patch.items()}
350
+ drum_kits2number = {v: k for k, v in number2drum_kits.items()}
351
+ key_signatures = ['C♭', 'A♭m', 'G♭', 'E♭m', 'D♭', 'B♭m', 'A♭', 'Fm', 'E♭', 'Cm', 'B♭', 'Gm', 'F', 'Dm',
352
+ 'C', 'Am', 'G', 'Em', 'D', 'Bm', 'A', 'F♯m', 'E', 'C♯m', 'B', 'G♯m', 'F♯', 'D♯m', 'C♯', 'A♯m']
353
+
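The 30 keys above are ordered so that the 1-based radio index k ("auto" occupies index 0) decodes in run() as sf = (k - 1) // 2 - 7 accidentals and mi = (k - 1) % 2 (0 = major, 1 = minor); for example:

    k = key_signatures.index('C') + 1   # 'C' sits at list index 14, so k = 15
    sf = (k - 1) // 2 - 7               # 0 sharps/flats
    mi = (k - 1) % 2                    # 0 -> major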
354
+ if __name__ == "__main__":
355
+ parser = argparse.ArgumentParser()
356
+ parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
357
+ parser.add_argument("--port", type=int, default=7860, help="gradio server port")
358
+ parser.add_argument("--device", type=str, default="cuda", help="device to run model")
359
+ parser.add_argument("--batch", type=int, default=8, help="batch size")
360
+ parser.add_argument("--max-gen", type=int, default=1024, help="max number of midi events to generate")
361
+ opt = parser.parse_args()
362
+ OUTPUT_BATCH_SIZE = opt.batch
363
+ soundfont_path = hf_hub_download_retry(repo_id="skytnt/midi-model", filename="soundfont.sf2")
364
+ thread_pool = ThreadPoolExecutor(max_workers=OUTPUT_BATCH_SIZE)
365
+ synthesizer = MidiSynthesizer(soundfont_path)
366
+ models_info = {
367
+ "generic pretrain model (tv2o-medium) by skytnt": [
368
+ "skytnt/midi-model-tv2o-medium", {
369
+ "jpop": "skytnt/midi-model-tv2om-jpop-lora",
370
+ "touhou": "skytnt/midi-model-tv2om-touhou-lora"
371
+ }
372
+ ],
373
+ "generic pretrain model (tv2o-large) by asigalov61": [
374
+ "asigalov61/Music-Llama", {}
375
+ ],
376
+ "generic pretrain model (tv2o-medium) by asigalov61": [
377
+ "asigalov61/Music-Llama-Medium", {}
378
+ ],
379
+ "generic pretrain model (tv1-medium) by skytnt": [
380
+ "skytnt/midi-model", {}
381
+ ]
382
+ }
383
+ models = {}
384
+ if opt.device == "cuda":
385
+ torch.backends.cudnn.deterministic = True
386
+ torch.backends.cudnn.benchmark = False
387
+ torch.backends.cuda.matmul.allow_tf32 = True
388
+ torch.backends.cudnn.allow_tf32 = True
389
+ torch.backends.cuda.enable_mem_efficient_sdp(True)
390
+ torch.backends.cuda.enable_flash_sdp(True)
391
+ for name, (repo_id, loras) in models_info.items():
392
+ model = MIDIModel.from_pretrained(repo_id)
393
+ model.to(device="cpu", dtype=torch.float32)
394
+ models[name] = model
395
+ for lora_name, lora_repo in loras.items():
396
+ model = MIDIModel.from_pretrained(repo_id)
397
+ print(f"loading lora {lora_repo} for {name}")
398
+ model = model.load_merge_lora(lora_repo)
399
+ model.to(device="cpu", dtype=torch.float32)
400
+ models[f"{name} with {lora_name} lora"] = model
401
+
402
+ load_javascript()
403
+ app = gr.Blocks(theme=gr.themes.Soft())
404
+ with app:
405
+ gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Midi Composer</h1>")
406
+ gr.Markdown("![Visitors](https://api.visitorbadge.io/api/visitors?path=skytnt.midi-composer&style=flat)\n\n"
407
+ "Midi event transformer for symbolic music generation\n\n"
408
+ "Demo for [SkyTNT/midi-model](https://github.com/SkyTNT/midi-model)\n\n"
409
+ "[Open In Colab]"
410
+ "(https://colab.research.google.com/github/SkyTNT/midi-model/blob/main/demo.ipynb)"
411
+ " or [download windows app](https://github.com/SkyTNT/midi-model/releases)"
412
+ " for unlimited generation\n\n"
413
+ "**Update v1.3**: MIDITokenizerV2 and new MidiVisualizer\n\n"
414
+ "The current **best** model: generic pretrain model (tv2o-medium) by skytnt"
415
+ )
416
+ js_msg = gr.Textbox(elem_id="msg_receiver", visible=False)
417
+ js_msg.change(None, [js_msg], [], js="""
418
+ (msg_json) =>{
419
+ let msgs = JSON.parse(msg_json);
420
+ executeCallbacks(msgReceiveCallbacks, msgs);
421
+ return [];
422
+ }
423
+ """)
424
+ input_model = gr.Dropdown(label="select model", choices=list(models.keys()),
425
+ type="value", value=list(models.keys())[0])
426
+ tab_select = gr.State(value=0)
427
+ with gr.Tabs():
428
+ with gr.TabItem("custom prompt") as tab1:
429
+ input_instruments = gr.Dropdown(label="🪗instruments (auto if empty)", choices=list(patch2number.keys()),
430
+ multiselect=True, max_choices=15, type="value")
431
+ input_drum_kit = gr.Dropdown(label="🥁drum kit", choices=list(drum_kits2number.keys()), type="value",
432
+ value="None")
433
+ input_bpm = gr.Slider(label="BPM (beats per minute, auto if 0)", minimum=0, maximum=255,
434
+ step=1,
435
+ value=0)
436
+ input_time_sig = gr.Radio(label="time signature (only for tv2 models)",
437
+ value="auto",
438
+ choices=["auto", "4/4", "2/4", "3/4", "6/4", "7/4",
439
+ "2/2", "3/2", "4/2", "3/8", "5/8", "6/8", "7/8", "9/8", "12/8"]
440
+ )
441
+ input_key_sig = gr.Radio(label="key signature (only for tv2 models)",
442
+ value="auto",
443
+ choices=["auto"] + key_signatures,
444
+ type="index"
445
+ )
446
+ example1 = gr.Examples([
447
+ [[], "None"],
448
+ [["Acoustic Grand"], "None"],
449
+ [['Acoustic Grand', 'SynthStrings 2', 'SynthStrings 1', 'Pizzicato Strings',
450
+ 'Pad 2 (warm)', 'Tremolo Strings', 'String Ensemble 1'], "Orchestra"],
451
+ [['Trumpet', 'Oboe', 'Trombone', 'String Ensemble 1', 'Clarinet',
452
+ 'French Horn', 'Pad 4 (choir)', 'Bassoon', 'Flute'], "None"],
453
+ [['Flute', 'French Horn', 'Clarinet', 'String Ensemble 2', 'English Horn', 'Bassoon',
454
+ 'Oboe', 'Pizzicato Strings'], "Orchestra"],
455
+ [['Electric Piano 2', 'Lead 5 (charang)', 'Electric Bass(pick)', 'Lead 2 (sawtooth)',
456
+ 'Pad 1 (new age)', 'Orchestra Hit', 'Cello', 'Electric Guitar(clean)'], "Standard"],
457
+ [["Electric Guitar(clean)", "Electric Guitar(muted)", "Overdriven Guitar", "Distortion Guitar",
458
+ "Electric Bass(finger)"], "Standard"]
459
+ ], [input_instruments, input_drum_kit])
460
+ with gr.TabItem("midi prompt") as tab2:
461
+ input_midi = gr.File(label="input midi", file_types=[".midi", ".mid"], type="binary")
462
+ input_midi_events = gr.Slider(label="use first n midi events as prompt", minimum=1, maximum=512,
463
+ step=1,
464
+ value=128)
465
+ input_reduce_cc_st = gr.Checkbox(label="reduce control_change and set_tempo events", value=True)
466
+ input_remap_track_channel = gr.Checkbox(
467
+ label="remap tracks and channels so each track has only one channel and in order", value=True)
468
+ input_add_default_instr = gr.Checkbox(
469
+ label="add a default instrument to channels that don't have an instrument", value=True)
470
+ input_remove_empty_channels = gr.Checkbox(label="remove channels without notes", value=False)
471
+ example2 = gr.Examples([[file, 128] for file in glob.glob("example/*.mid")],
472
+ [input_midi, input_midi_events])
473
+ with gr.TabItem("last output prompt") as tab3:
474
+ gr.Markdown("Continue generating on the last output.")
475
+ input_continuation_select = gr.Radio(label="select output to continue generating", value="all",
476
+ choices=["all"] + [f"output{i + 1}" for i in
477
+ range(OUTPUT_BATCH_SIZE)],
478
+ type="index"
479
+ )
480
+ undo_btn = gr.Button("undo the last continuation")
481
+
482
+ tab1.select(lambda: 0, None, tab_select, queue=False)
483
+ tab2.select(lambda: 1, None, tab_select, queue=False)
484
+ tab3.select(lambda: 2, None, tab_select, queue=False)
485
+ input_seed = gr.Slider(label="seed", minimum=0, maximum=2 ** 31 - 1,
486
+ step=1, value=0)
487
+ input_seed_rand = gr.Checkbox(label="random seed", value=True)
488
+ input_gen_events = gr.Slider(label="generate max n midi events", minimum=1, maximum=opt.max_gen,
489
+ step=1, value=opt.max_gen // 2)
490
+ with gr.Accordion("options", open=False):
491
+ input_temp = gr.Slider(label="temperature", minimum=0.1, maximum=1.2, step=0.01, value=1)
492
+ input_top_p = gr.Slider(label="top p", minimum=0.1, maximum=1, step=0.01, value=0.95)
493
+ input_top_k = gr.Slider(label="top k", minimum=1, maximum=128, step=1, value=20)
494
+ input_allow_cc = gr.Checkbox(label="allow midi cc event", value=True)
495
+ input_render_audio = gr.Checkbox(label="render audio after generation", value=True)
496
+ example3 = gr.Examples([[1, 0.94, 128], [1, 0.98, 20], [1, 0.98, 12]],
497
+ [input_temp, input_top_p, input_top_k])
498
+ run_btn = gr.Button("generate", variant="primary")
499
+ # stop_btn = gr.Button("stop and output")
500
+ output_midi_seq = gr.State()
501
+ output_continuation_state = gr.State([0])
502
+ midi_outputs = []
503
+ audio_outputs = []
504
+ with gr.Tabs(elem_id="output_tabs"):
505
+ for i in range(OUTPUT_BATCH_SIZE):
506
+ with gr.TabItem(f"output {i + 1}") as tab1:
507
+ output_midi_visualizer = gr.HTML(elem_id=f"midi_visualizer_container_{i}")
508
+ output_audio = gr.Audio(label="output audio", format="mp3", elem_id=f"midi_audio_{i}")
509
+ output_midi = gr.File(label="output midi", file_types=[".mid"])
510
+ midi_outputs.append(output_midi)
511
+ audio_outputs.append(output_audio)
512
+ run_event = run_btn.click(run, [input_model, tab_select, output_midi_seq, output_continuation_state,
513
+ input_continuation_select, input_instruments, input_drum_kit, input_bpm,
514
+ input_time_sig, input_key_sig, input_midi, input_midi_events,
515
+ input_reduce_cc_st, input_remap_track_channel,
516
+ input_add_default_instr, input_remove_empty_channels,
517
+ input_seed, input_seed_rand, input_gen_events, input_temp, input_top_p,
518
+ input_top_k, input_allow_cc],
519
+ [output_midi_seq, output_continuation_state, input_seed, js_msg], queue=True)
520
+ finish_run_event = run_event.then(fn=finish_run,
521
+ inputs=[input_model, output_midi_seq],
522
+ outputs=midi_outputs + [js_msg],
523
+ queue=False)
524
+ finish_run_event.then(fn=render_audio,
525
+ inputs=[input_model, output_midi_seq, input_render_audio],
526
+ outputs=audio_outputs,
527
+ queue=False)
528
+ # stop_btn.click(None, [], [], cancels=run_event,
529
+ # queue=False)
530
+ undo_btn.click(undo_continuation, [input_model, output_midi_seq, output_continuation_state],
531
+ [output_midi_seq, output_continuation_state, js_msg], queue=False)
532
+ app.queue().launch(server_port=opt.port, share=opt.share, inbrowser=True, ssr_mode=False)
533
+ thread_pool.shutdown()
app_onnx.py ADDED
@@ -0,0 +1,625 @@
1
+ import spaces
2
+ import random
3
+ import argparse
4
+ import glob
5
+ import json
6
+ import os
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor
9
+
10
+ import gradio as gr
11
+ import numpy as np
12
+ import onnxruntime as rt
13
+ import tqdm
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ import MIDI
17
+ from midi_synthesizer import MidiSynthesizer
18
+ from midi_tokenizer import MIDITokenizer
19
+
20
+ MAX_SEED = np.iinfo(np.int32).max
21
+ in_space = os.getenv("SYSTEM") == "spaces"
22
+
23
+
24
+ def softmax(x, axis):
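+ # Subtract the per-row max before exponentiating for numerical stability.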
25
+ x_max = np.amax(x, axis=axis, keepdims=True)
26
+ exp_x_shifted = np.exp(x - x_max)
27
+ return exp_x_shifted / np.sum(exp_x_shifted, axis=axis, keepdims=True)
28
+
29
+
30
+ def sample_top_p_k(probs, p, k, generator=None):
31
+ if generator is None:
32
+ generator = np.random
33
+ probs_idx = np.argsort(-probs, axis=-1)
34
+ probs_sort = np.take_along_axis(probs, probs_idx, -1)
35
+ probs_sum = np.cumsum(probs_sort, axis=-1)
36
+ mask = probs_sum - probs_sort > p
37
+ probs_sort[mask] = 0.0
38
+ mask = np.zeros(probs_sort.shape[-1])
39
+ mask[:k] = 1
40
+ probs_sort = probs_sort * mask
41
+ probs_sort /= np.sum(probs_sort, axis=-1, keepdims=True)
42
+ shape = probs_sort.shape
43
+ probs_sort_flat = probs_sort.reshape(-1, shape[-1])
44
+ probs_idx_flat = probs_idx.reshape(-1, shape[-1])
45
+ next_token = np.stack([generator.choice(idxs, p=pvals) for pvals, idxs in zip(probs_sort_flat, probs_idx_flat)])
46
+ next_token = next_token.reshape(*shape[:-1])
47
+ return next_token
48
+
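A quick sanity check of sample_top_p_k with hypothetical values:

    probs = softmax(np.array([[4.0, 3.0, 1.0, 0.5]]), axis=-1)
    token = sample_top_p_k(probs, p=0.9, k=2)   # only ids 0 or 1 survive the top-k cut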
49
+
50
+ def apply_io_binding(model: rt.InferenceSession, inputs, outputs, batch_size, past_len, cur_len):
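+ # Pre-binds all inputs/outputs to on-device OrtValues: each "past_key_values"
+ # input reuses the previous step's "present" output, so the KV cache never
+ # leaves the device between decoding steps.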
51
+ io_binding = model.io_binding()
52
+ for input_ in model.get_inputs():
53
+ name = input_.name
54
+ if name.startswith("past_key_values"):
55
+ present_name = name.replace("past_key_values", "present")
56
+ if present_name in outputs:
57
+ v = outputs[present_name]
58
+ else:
59
+ v = rt.OrtValue.ortvalue_from_shape_and_type(
60
+ (batch_size, input_.shape[1], past_len, input_.shape[3]),
61
+ element_type=np.float32,
62
+ device_type=device)
63
+ inputs[name] = v
64
+ else:
65
+ v = inputs[name]
66
+ io_binding.bind_ortvalue_input(name, v)
67
+
68
+ for output in model.get_outputs():
69
+ name = output.name
70
+ if name.startswith("present"):
71
+ v = rt.OrtValue.ortvalue_from_shape_and_type(
72
+ (batch_size, output.shape[1], cur_len, output.shape[3]),
73
+ element_type=np.float32,
74
+ device_type=device)
75
+ outputs[name] = v
76
+ else:
77
+ v = outputs[name]
78
+ io_binding.bind_ortvalue_output(name, v)
79
+ return io_binding
80
+
81
+ def generate(model, prompt=None, batch_size=1, max_len=512, temp=1.0, top_p=0.98, top_k=20,
82
+ disable_patch_change=False, disable_control_change=False, disable_channels=None, generator=None):
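+ # model is [base_session, token_session, tokenizer]: the base ONNX graph
+ # produces one hidden state per event, and the token graph decodes that
+ # event's tokens from the hidden state.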
83
+ tokenizer = model[2]
84
+ if disable_channels is not None:
85
+ disable_channels = [tokenizer.parameter_ids["channel"][c] for c in disable_channels]
86
+ else:
87
+ disable_channels = []
88
+ if generator is None:
89
+ generator = np.random
90
+ max_token_seq = tokenizer.max_token_seq
91
+ if prompt is None:
92
+ input_tensor = np.full((1, max_token_seq), tokenizer.pad_id, dtype=np.int64)
93
+ input_tensor[0, 0] = tokenizer.bos_id # bos
94
+ input_tensor = input_tensor[None, :, :]
95
+ input_tensor = np.repeat(input_tensor, repeats=batch_size, axis=0)
96
+ else:
97
+ if len(prompt.shape) == 2:
98
+ prompt = prompt[None, :]
99
+ prompt = np.repeat(prompt, repeats=batch_size, axis=0)
100
+ elif prompt.shape[0] == 1:
101
+ prompt = np.repeat(prompt, repeats=batch_size, axis=0)
102
+ elif len(prompt.shape) != 3 or prompt.shape[0] != batch_size:
103
+ raise ValueError(f"invalid shape for prompt, {prompt.shape}")
104
+ prompt = prompt[..., :max_token_seq]
105
+ if prompt.shape[-1] < max_token_seq:
106
+ prompt = np.pad(prompt, ((0, 0), (0, 0), (0, max_token_seq - prompt.shape[-1])),
107
+ mode="constant", constant_values=tokenizer.pad_id)
108
+ input_tensor = prompt
109
+ cur_len = input_tensor.shape[1]
110
+ bar = tqdm.tqdm(desc="generating", total=max_len - cur_len, disable=in_space)
111
+ model0_inputs = {}
112
+ model0_outputs = {}
113
+ emb_size = 1024
114
+ for output in model[0].get_outputs():
115
+ if output.name == "hidden":
116
+ emb_size = output.shape[2]
117
+ past_len = 0
118
+ with bar:
119
+ while cur_len < max_len:
120
+ end = [False] * batch_size
121
+ model0_inputs["x"] = rt.OrtValue.ortvalue_from_numpy(input_tensor[:, past_len:], device_type=device)
122
+ model0_outputs["hidden"] = rt.OrtValue.ortvalue_from_shape_and_type(
123
+ (batch_size, cur_len - past_len, emb_size),
124
+ element_type=np.float32,
125
+ device_type=device)
126
+ io_binding = apply_io_binding(model[0], model0_inputs, model0_outputs, batch_size, past_len, cur_len)
127
+ io_binding.synchronize_inputs()
128
+ model[0].run_with_iobinding(io_binding)
129
+ io_binding.synchronize_outputs()
130
+
131
+ hidden = model0_outputs["hidden"].numpy()[:, -1:]
132
+ next_token_seq = np.zeros((batch_size, 0), dtype=np.int64)
133
+ event_names = [""] * batch_size
134
+ model1_inputs = {"hidden": rt.OrtValue.ortvalue_from_numpy(hidden, device_type=device)}
135
+ model1_outputs = {}
136
+ for i in range(max_token_seq):
137
+ mask = np.zeros((batch_size, tokenizer.vocab_size), dtype=np.int64)
138
+ for b in range(batch_size):
139
+ if end[b]:
140
+ mask[b, tokenizer.pad_id] = 1
141
+ continue
142
+ if i == 0:
143
+ mask_ids = list(tokenizer.event_ids.values()) + [tokenizer.eos_id]
144
+ if disable_patch_change:
145
+ mask_ids.remove(tokenizer.event_ids["patch_change"])
146
+ if disable_control_change:
147
+ mask_ids.remove(tokenizer.event_ids["control_change"])
148
+ mask[b, mask_ids] = 1
149
+ else:
150
+ param_names = tokenizer.events[event_names[b]]
151
+ if i > len(param_names):
152
+ mask[b, tokenizer.pad_id] = 1
153
+ continue
154
+ param_name = param_names[i - 1]
155
+ mask_ids = tokenizer.parameter_ids[param_name]
156
+ if param_name == "channel":
157
+ mask_ids = [i for i in mask_ids if i not in disable_channels]
158
+ mask[b, mask_ids] = 1
159
+ mask = mask[:, None, :]
160
+ x = next_token_seq
161
+ if i != 0:
162
+ # cached
163
+ if i == 1:
164
+ hidden = np.zeros((batch_size, 0, emb_size), dtype=np.float32)
165
+ model1_inputs["hidden"] = rt.OrtValue.ortvalue_from_numpy(hidden, device_type=device)
166
+ x = x[:, -1:]
167
+ model1_inputs["x"] = rt.OrtValue.ortvalue_from_numpy(x, device_type=device)
168
+ model1_outputs["y"] = rt.OrtValue.ortvalue_from_shape_and_type(
169
+ (batch_size, 1, tokenizer.vocab_size),
170
+ element_type=np.float32,
171
+ device_type=device
172
+ )
173
+ io_binding = apply_io_binding(model[1], model1_inputs, model1_outputs, batch_size, i, i+1)
174
+ io_binding.synchronize_inputs()
175
+ model[1].run_with_iobinding(io_binding)
176
+ io_binding.synchronize_outputs()
177
+ logits = model1_outputs["y"].numpy()
178
+ scores = softmax(logits / temp, -1) * mask
179
+ samples = sample_top_p_k(scores, top_p, top_k, generator)
180
+ if i == 0:
181
+ next_token_seq = samples
182
+ for b in range(batch_size):
183
+ if end[b]:
184
+ continue
185
+ eid = samples[b].item()
186
+ if eid == tokenizer.eos_id:
187
+ end[b] = True
188
+ else:
189
+ event_names[b] = tokenizer.id_events[eid]
190
+ else:
191
+ next_token_seq = np.concatenate([next_token_seq, samples], axis=1)
192
+ if all([len(tokenizer.events[event_names[b]]) == i for b in range(batch_size) if not end[b]]):
193
+ break
194
+ if next_token_seq.shape[1] < max_token_seq:
195
+ next_token_seq = np.pad(next_token_seq,
196
+ ((0, 0), (0, max_token_seq - next_token_seq.shape[-1])),
197
+ mode="constant", constant_values=tokenizer.pad_id)
198
+ next_token_seq = next_token_seq[:, None, :]
199
+ input_tensor = np.concatenate([input_tensor, next_token_seq], axis=1)
200
+ past_len = cur_len
201
+ cur_len += 1
202
+ bar.update(1)
203
+ yield next_token_seq[:, 0]
204
+ if all(end):
205
+ break
206
+
207
+
208
+ def create_msg(name, data):
209
+ return {"name": name, "data": data}
210
+
211
+
212
+ def send_msgs(msgs):
213
+ return json.dumps(msgs)
214
+
215
+
216
+ def get_duration(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm,
217
+ time_sig, key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr,
218
+ remove_empty_channels, seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
219
+ t = gen_events // 28
220
+ if "large" in model_name:
221
+ t = gen_events // 20
222
+ return t + 10
223
+
224
+
225
+ @spaces.GPU(duration=get_duration)
226
+ def run(model_name, tab, mid_seq, continuation_state, continuation_select, instruments, drum_kit, bpm, time_sig,
227
+ key_sig, mid, midi_events, reduce_cc_st, remap_track_channel, add_default_instr, remove_empty_channels,
228
+ seed, seed_rand, gen_events, temp, top_p, top_k, allow_cc):
229
+ model = models[model_name]
230
+ model_base = rt.InferenceSession(model[0], providers=providers)
231
+ model_token = rt.InferenceSession(model[1], providers=providers)
232
+ tokenizer = model[2]
233
+ model = [model_base, model_token, tokenizer]
234
+ bpm = int(bpm)
235
+ if time_sig == "auto":
236
+ time_sig = None
237
+ time_sig_nn = 4
238
+ time_sig_dd = 2
239
+ else:
240
+ time_sig_nn, time_sig_dd = time_sig.split('/')
241
+ time_sig_nn = int(time_sig_nn)
242
+ time_sig_dd = {2: 1, 4: 2, 8: 3}[int(time_sig_dd)]
243
+ if key_sig == 0:
244
+ key_sig = None
245
+ key_sig_sf = 0
246
+ key_sig_mi = 0
247
+ else:
248
+ key_sig = (key_sig - 1)
249
+ key_sig_sf = key_sig // 2 - 7
250
+ key_sig_mi = key_sig % 2
251
+ gen_events = int(gen_events)
252
+ max_len = gen_events
253
+ if seed_rand:
254
+ seed = random.randint(0, MAX_SEED)
255
+ generator = np.random.RandomState(seed)
256
+ disable_patch_change = False
257
+ disable_channels = None
258
+ if tab == 0:
259
+ i = 0
260
+ mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
261
+ if tokenizer.version == "v2":
262
+ if time_sig is not None:
263
+ mid.append(tokenizer.event2tokens(["time_signature", 0, 0, 0, time_sig_nn - 1, time_sig_dd - 1]))
264
+ if key_sig is not None:
265
+ mid.append(tokenizer.event2tokens(["key_signature", 0, 0, 0, key_sig_sf + 7, key_sig_mi]))
266
+ if bpm != 0:
267
+ mid.append(tokenizer.event2tokens(["set_tempo", 0, 0, 0, bpm]))
268
+ patches = {}
269
+ if instruments is None:
270
+ instruments = []
271
+ for instr in instruments:
272
+ patches[i] = patch2number[instr]
273
+ i = (i + 1) if i != 8 else 10
274
+ if drum_kit != "None":
275
+ patches[9] = drum_kits2number[drum_kit]
276
+ for i, (c, p) in enumerate(patches.items()):
277
+ mid.append(tokenizer.event2tokens(["patch_change", 0, 0, i + 1, c, p]))
278
+ mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
279
+ mid_seq = mid.tolist()
280
+ if len(instruments) > 0:
281
+ disable_patch_change = True
282
+ disable_channels = [i for i in range(16) if i not in patches]
283
+ elif tab == 1 and mid is not None:
284
+ eps = 4 if reduce_cc_st else 0
285
+ mid = tokenizer.tokenize(MIDI.midi2score(mid), cc_eps=eps, tempo_eps=eps,
286
+ remap_track_channel=remap_track_channel,
287
+ add_default_instr=add_default_instr,
288
+ remove_empty_channels=remove_empty_channels)
289
+ mid = mid[:int(midi_events)]
290
+ mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
291
+ mid_seq = mid.tolist()
292
+ elif tab == 2 and mid_seq is not None:
293
+ mid = np.asarray(mid_seq, dtype=np.int64)
294
+ if continuation_select > 0:
295
+ continuation_state.append(mid_seq)
296
+ mid = np.repeat(mid[continuation_select - 1:continuation_select], repeats=OUTPUT_BATCH_SIZE, axis=0)
297
+ mid_seq = mid.tolist()
298
+ else:
299
+ continuation_state.append(mid.shape[1])
300
+ else:
301
+ continuation_state = [0]
302
+ mid = [[tokenizer.bos_id] + [tokenizer.pad_id] * (tokenizer.max_token_seq - 1)]
303
+ mid = np.asarray([mid] * OUTPUT_BATCH_SIZE, dtype=np.int64)
304
+ mid_seq = mid.tolist()
305
+
306
+ if mid is not None:
307
+ max_len += mid.shape[1]
308
+
309
+ init_msgs = [create_msg("progress", [0, gen_events])]
310
+ if not (tab == 2 and continuation_select == 0):
311
+ for i in range(OUTPUT_BATCH_SIZE):
312
+ events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
313
+ init_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
314
+ create_msg("visualizer_append", [i, events])]
315
+ yield mid_seq, continuation_state, seed, send_msgs(init_msgs)
316
+ midi_generator = generate(model, mid, batch_size=OUTPUT_BATCH_SIZE, max_len=max_len, temp=temp,
317
+ top_p=top_p, top_k=top_k, disable_patch_change=disable_patch_change,
318
+ disable_control_change=not allow_cc, disable_channels=disable_channels,
319
+ generator=generator)
320
+ events = [list() for i in range(OUTPUT_BATCH_SIZE)]
321
+ t = time.time() + 1
322
+ for i, token_seqs in enumerate(midi_generator):
323
+ token_seqs = token_seqs.tolist()
324
+ for j in range(OUTPUT_BATCH_SIZE):
325
+ token_seq = token_seqs[j]
326
+ mid_seq[j].append(token_seq)
327
+ events[j].append(tokenizer.tokens2event(token_seq))
328
+ if time.time() - t > 0.5:
329
+ msgs = [create_msg("progress", [i + 1, gen_events])]
330
+ for j in range(OUTPUT_BATCH_SIZE):
331
+ msgs += [create_msg("visualizer_append", [j, events[j]])]
332
+ events[j] = list()
333
+ yield mid_seq, continuation_state, seed, send_msgs(msgs)
334
+ t = time.time()
335
+ yield mid_seq, continuation_state, seed, send_msgs([])
336
+
337
+
338
+ def finish_run(model_name, mid_seq):
339
+ if mid_seq is None:
340
+ outputs = [None] * OUTPUT_BATCH_SIZE
341
+ return *outputs, []
342
+ tokenizer = models[model_name][2]
343
+ outputs = []
344
+ end_msgs = [create_msg("progress", [0, 0])]
345
+ if not os.path.exists("outputs"):
346
+ os.mkdir("outputs")
347
+ for i in range(OUTPUT_BATCH_SIZE):
348
+ events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
349
+ mid = tokenizer.detokenize(mid_seq[i])
350
+ with open(f"outputs/output{i + 1}.mid", 'wb') as f:
351
+ f.write(MIDI.score2midi(mid))
352
+ outputs.append(f"outputs/output{i + 1}.mid")
353
+ end_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
354
+ create_msg("visualizer_append", [i, events]),
355
+ create_msg("visualizer_end", i)]
356
+ return *outputs, send_msgs(end_msgs)
357
+
358
+
359
+ def synthesis_task(mid):
360
+ return synthesizer.synthesis(MIDI.score2opus(mid))
361
+
362
+ def render_audio(model_name, mid_seq, should_render_audio):
363
+ if (not should_render_audio) or mid_seq is None:
364
+ outputs = [None] * OUTPUT_BATCH_SIZE
365
+ return tuple(outputs)
366
+ tokenizer = models[model_name][2]
367
+ outputs = []
368
+ if not os.path.exists("outputs"):
369
+ os.mkdir("outputs")
370
+ audio_futures = []
371
+ for i in range(OUTPUT_BATCH_SIZE):
372
+ mid = tokenizer.detokenize(mid_seq[i])
373
+ audio_future = thread_pool.submit(synthesis_task, mid)
374
+ audio_futures.append(audio_future)
375
+ for future in audio_futures:
376
+ outputs.append((44100, future.result()))
377
+ if OUTPUT_BATCH_SIZE == 1:
378
+ return outputs[0]
379
+ return tuple(outputs)
380
+
381
+
382
+ def undo_continuation(model_name, mid_seq, continuation_state):
383
+ if mid_seq is None or len(continuation_state) < 2:
384
+ return mid_seq, continuation_state, send_msgs([])
385
+ tokenizer = models[model_name][2]
386
+ if isinstance(continuation_state[-1], list):
387
+ mid_seq = continuation_state[-1]
388
+ else:
389
+ mid_seq = [ms[:continuation_state[-1]] for ms in mid_seq]
390
+ continuation_state = continuation_state[:-1]
391
+ end_msgs = [create_msg("progress", [0, 0])]
392
+ for i in range(OUTPUT_BATCH_SIZE):
393
+ events = [tokenizer.tokens2event(tokens) for tokens in mid_seq[i]]
394
+ end_msgs += [create_msg("visualizer_clear", [i, tokenizer.version]),
395
+ create_msg("visualizer_append", [i, events]),
396
+ create_msg("visualizer_end", i)]
397
+ return mid_seq, continuation_state, send_msgs(end_msgs)
398
+
399
+
400
+ def load_javascript(dir="javascript"):
401
+ scripts_list = glob.glob(f"{dir}/*.js")
402
+ javascript = ""
403
+ for path in scripts_list:
404
+ with open(path, "r", encoding="utf8") as jsfile:
405
+ js_content = jsfile.read()
406
+ js_content = js_content.replace("const MIDI_OUTPUT_BATCH_SIZE=4;",
407
+ f"const MIDI_OUTPUT_BATCH_SIZE={OUTPUT_BATCH_SIZE};")
408
+ javascript += f"\n<!-- {path} --><script>{js_content}</script>"
409
+ template_response_ori = gr.routes.templates.TemplateResponse
410
+
411
+ def template_response(*args, **kwargs):
412
+ res = template_response_ori(*args, **kwargs)
413
+ res.body = res.body.replace(
414
+ b'</head>', f'{javascript}</head>'.encode("utf8"))
415
+ res.init_headers()
416
+ return res
417
+
418
+ gr.routes.templates.TemplateResponse = template_response
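+ # this monkey-patches gradio's TemplateResponse so every served page gets the visualizer
+ # scripts injected just before </head>, with OUTPUT_BATCH_SIZE patched into the JS at load time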
419
+
420
+
421
+ def hf_hub_download_retry(repo_id, filename):
422
+ print(f"downloading {repo_id} {filename}")
423
+ retry = 0
424
+ err = None
425
+ while retry < 30:
426
+ try:
427
+ return hf_hub_download(repo_id=repo_id, filename=filename)
428
+ except Exception as e:
429
+ err = e
430
+ retry += 1
431
+ if err:
432
+ raise err
433
+
434
+
435
+ def get_tokenizer(repo_id):
436
+ config_path = hf_hub_download_retry(repo_id=repo_id, filename="config.json")
437
+ with open(config_path, "r") as f:
438
+ config = json.load(f)
439
+ tokenizer = MIDITokenizer(config["tokenizer"]["version"])
440
+ tokenizer.set_optimise_midi(config["tokenizer"]["optimise_midi"])
441
+ return tokenizer
442
+
443
+
444
+ number2drum_kits = {-1: "None", 0: "Standard", 8: "Room", 16: "Power", 24: "Electric", 25: "TR-808", 32: "Jazz",
445
+ 40: "Brush", 48: "Orchestra"}
446
+ patch2number = {v: k for k, v in MIDI.Number2patch.items()}
447
+ drum_kits2number = {v: k for k, v in number2drum_kits.items()}
448
+ key_signatures = ['C♭', 'A♭m', 'G♭', 'E♭m', 'D♭', 'B♭m', 'A♭', 'Fm', 'E♭', 'Cm', 'B♭', 'Gm', 'F', 'Dm',
449
+ 'C', 'Am', 'G', 'Em', 'D', 'Bm', 'A', 'F♯m', 'E', 'C♯m', 'B', 'G♯m', 'F♯', 'D♯m', 'C♯', 'A♯m']
450
+
451
+ if __name__ == "__main__":
452
+ parser = argparse.ArgumentParser()
453
+ parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
454
+ parser.add_argument("--port", type=int, default=7860, help="gradio server port")
455
+ parser.add_argument("--device", type=str, default="cuda", help="device to run model")
456
+ parser.add_argument("--batch", type=int, default=8, help="batch size")
457
+ parser.add_argument("--max-gen", type=int, default=1024, help="max number of midi events to generate")
458
+ opt = parser.parse_args()
459
+ OUTPUT_BATCH_SIZE = opt.batch
460
+ soundfont_path = hf_hub_download_retry(repo_id="skytnt/midi-model", filename="soundfont.sf2")
461
+ thread_pool = ThreadPoolExecutor(max_workers=OUTPUT_BATCH_SIZE)
462
+ synthesizer = MidiSynthesizer(soundfont_path)
463
+ models_info = {
464
+ "generic pretrain model (tv2o-medium) by skytnt": [
465
+ "skytnt/midi-model-tv2o-medium", "", {
466
+ "jpop": "skytnt/midi-model-tv2om-jpop-lora",
467
+ "touhou": "skytnt/midi-model-tv2om-touhou-lora"
468
+ }
469
+ ],
470
+ "generic pretrain model (tv2o-large) by asigalov61": [
471
+ "asigalov61/Music-Llama", "", {}
472
+ ],
473
+ "generic pretrain model (tv2o-medium) by asigalov61": [
474
+ "asigalov61/Music-Llama-Medium", "", {}
475
+ ],
476
+ "generic pretrain model (tv1-medium) by skytnt": [
477
+ "skytnt/midi-model", "", {}
478
+ ]
479
+ }
480
+ models = {}
481
+ providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
482
+ device = "cuda"
483
+
484
+ for name, (repo_id, path, loras) in models_info.items():
485
+ model_base_path = hf_hub_download_retry(repo_id=repo_id, filename=f"{path}onnx/model_base.onnx")
486
+ model_token_path = hf_hub_download_retry(repo_id=repo_id, filename=f"{path}onnx/model_token.onnx")
487
+ tokenizer = get_tokenizer(repo_id)
488
+ models[name] = [model_base_path, model_token_path, tokenizer]
489
+ for lora_name, lora_repo in loras.items():
490
+ model_base_path = hf_hub_download_retry(repo_id=lora_repo, filename="onnx/model_base.onnx")
491
+ model_token_path = hf_hub_download_retry(repo_id=lora_repo, filename="onnx/model_token.onnx")
492
+ models[f"{name} with {lora_name} lora"] = [model_base_path, model_token_path, tokenizer]
493
+
494
+ load_javascript()
495
+ app = gr.Blocks(theme=gr.themes.Soft())
496
+ with app:
497
+ gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>Midi Composer</h1>")
498
+ gr.Markdown("![Visitors](https://api.visitorbadge.io/api/visitors?path=skytnt.midi-composer&style=flat)\n\n"
499
+ "Midi event transformer for symbolic music generation\n\n"
500
+ "Demo for [SkyTNT/midi-model](https://github.com/SkyTNT/midi-model)\n\n"
501
+ "[Open In Colab]"
502
+ "(https://colab.research.google.com/github/SkyTNT/midi-model/blob/main/demo.ipynb)"
503
+ " or [download windows app](https://github.com/SkyTNT/midi-model/releases)"
504
+ " for unlimited generation\n\n"
505
+ "**Update v1.3**: MIDITokenizerV2 and new MidiVisualizer\n\n"
506
+ "The current **best** model: generic pretrain model (tv2o-medium) by skytnt"
507
+ )
508
+ js_msg = gr.Textbox(elem_id="msg_receiver", visible=False)
509
+ js_msg.change(None, [js_msg], [], js="""
510
+ (msg_json) =>{
511
+ let msgs = JSON.parse(msg_json);
512
+ executeCallbacks(msgReceiveCallbacks, msgs);
513
+ return [];
514
+ }
515
+ """)
516
+ input_model = gr.Dropdown(label="select model", choices=list(models.keys()),
517
+ type="value", value=list(models.keys())[0])
518
+ tab_select = gr.State(value=0)
519
+ with gr.Tabs():
520
+ with gr.TabItem("custom prompt") as tab1:
521
+ input_instruments = gr.Dropdown(label="🪗instruments (auto if empty)", choices=list(patch2number.keys()),
522
+ multiselect=True, max_choices=15, type="value")
523
+ input_drum_kit = gr.Dropdown(label="🥁drum kit", choices=list(drum_kits2number.keys()), type="value",
524
+ value="None")
525
+ input_bpm = gr.Slider(label="BPM (beats per minute, auto if 0)", minimum=0, maximum=255,
526
+ step=1,
527
+ value=0)
528
+ input_time_sig = gr.Radio(label="time signature (only for tv2 models)",
529
+ value="auto",
530
+ choices=["auto", "4/4", "2/4", "3/4", "6/4", "7/4",
531
+ "2/2", "3/2", "4/2", "3/8", "5/8", "6/8", "7/8", "9/8", "12/8"]
532
+ )
533
+ input_key_sig = gr.Radio(label="key signature (only for tv2 models)",
534
+ value="auto",
535
+ choices=["auto"] + key_signatures,
536
+ type="index"
537
+ )
538
+ example1 = gr.Examples([
539
+ [[], "None"],
540
+ [["Acoustic Grand"], "None"],
541
+ [['Acoustic Grand', 'SynthStrings 2', 'SynthStrings 1', 'Pizzicato Strings',
542
+ 'Pad 2 (warm)', 'Tremolo Strings', 'String Ensemble 1'], "Orchestra"],
543
+ [['Trumpet', 'Oboe', 'Trombone', 'String Ensemble 1', 'Clarinet',
544
+ 'French Horn', 'Pad 4 (choir)', 'Bassoon', 'Flute'], "None"],
545
+ [['Flute', 'French Horn', 'Clarinet', 'String Ensemble 2', 'English Horn', 'Bassoon',
546
+ 'Oboe', 'Pizzicato Strings'], "Orchestra"],
547
+ [['Electric Piano 2', 'Lead 5 (charang)', 'Electric Bass(pick)', 'Lead 2 (sawtooth)',
548
+ 'Pad 1 (new age)', 'Orchestra Hit', 'Cello', 'Electric Guitar(clean)'], "Standard"],
549
+ [["Electric Guitar(clean)", "Electric Guitar(muted)", "Overdriven Guitar", "Distortion Guitar",
550
+ "Electric Bass(finger)"], "Standard"]
551
+ ], [input_instruments, input_drum_kit])
552
+ with gr.TabItem("midi prompt") as tab2:
553
+ input_midi = gr.File(label="input midi", file_types=[".midi", ".mid"], type="binary")
554
+ input_midi_events = gr.Slider(label="use first n midi events as prompt", minimum=1, maximum=512,
555
+ step=1,
556
+ value=128)
557
+ input_reduce_cc_st = gr.Checkbox(label="reduce control_change and set_tempo events", value=True)
558
+ input_remap_track_channel = gr.Checkbox(
559
+ label="remap tracks and channels so that each track has a single channel, in order", value=True)
560
+ input_add_default_instr = gr.Checkbox(
561
+ label="add a default instrument to channels that don't have an instrument", value=True)
562
+ input_remove_empty_channels = gr.Checkbox(label="remove channels without notes", value=False)
563
+ example2 = gr.Examples([[file, 128] for file in glob.glob("example/*.mid")],
564
+ [input_midi, input_midi_events])
565
+ with gr.TabItem("last output prompt") as tab3:
566
+ gr.Markdown("Continue generating on the last output.")
567
+ input_continuation_select = gr.Radio(label="select output to continue generating", value="all",
568
+ choices=["all"] + [f"output{i + 1}" for i in
569
+ range(OUTPUT_BATCH_SIZE)],
570
+ type="index"
571
+ )
572
+ undo_btn = gr.Button("undo the last continuation")
573
+
574
+ tab1.select(lambda: 0, None, tab_select, queue=False)
575
+ tab2.select(lambda: 1, None, tab_select, queue=False)
576
+ tab3.select(lambda: 2, None, tab_select, queue=False)
577
+ input_seed = gr.Slider(label="seed", minimum=0, maximum=2 ** 31 - 1,
578
+ step=1, value=0)
579
+ input_seed_rand = gr.Checkbox(label="random seed", value=True)
580
+ input_gen_events = gr.Slider(label="generate max n midi events", minimum=1, maximum=opt.max_gen,
581
+ step=1, value=opt.max_gen // 2)
582
+ with gr.Accordion("options", open=False):
583
+ input_temp = gr.Slider(label="temperature", minimum=0.1, maximum=1.2, step=0.01, value=1)
584
+ input_top_p = gr.Slider(label="top p", minimum=0.1, maximum=1, step=0.01, value=0.95)
585
+ input_top_k = gr.Slider(label="top k", minimum=1, maximum=128, step=1, value=20)
586
+ input_allow_cc = gr.Checkbox(label="allow midi cc event", value=True)
587
+ input_render_audio = gr.Checkbox(label="render audio after generation", value=True)
588
+ example3 = gr.Examples([[1, 0.94, 128], [1, 0.98, 20], [1, 0.98, 12]],
589
+ [input_temp, input_top_p, input_top_k])
590
+ run_btn = gr.Button("generate", variant="primary")
591
+ # stop_btn = gr.Button("stop and output")
592
+ output_midi_seq = gr.State()
593
+ output_continuation_state = gr.State([0])
594
+ midi_outputs = []
595
+ audio_outputs = []
596
+ with gr.Tabs(elem_id="output_tabs"):
597
+ for i in range(OUTPUT_BATCH_SIZE):
598
+ with gr.TabItem(f"output {i + 1}") as tab1:
599
+ output_midi_visualizer = gr.HTML(elem_id=f"midi_visualizer_container_{i}")
600
+ output_audio = gr.Audio(label="output audio", format="mp3", elem_id=f"midi_audio_{i}")
601
+ output_midi = gr.File(label="output midi", file_types=[".mid"])
602
+ midi_outputs.append(output_midi)
603
+ audio_outputs.append(output_audio)
604
+ run_event = run_btn.click(run, [input_model, tab_select, output_midi_seq, output_continuation_state,
605
+ input_continuation_select, input_instruments, input_drum_kit, input_bpm,
606
+ input_time_sig, input_key_sig, input_midi, input_midi_events,
607
+ input_reduce_cc_st, input_remap_track_channel,
608
+ input_add_default_instr, input_remove_empty_channels,
609
+ input_seed, input_seed_rand, input_gen_events, input_temp, input_top_p,
610
+ input_top_k, input_allow_cc],
611
+ [output_midi_seq, output_continuation_state, input_seed, js_msg], queue=True)
612
+ finish_run_event = run_event.then(fn=finish_run,
613
+ inputs=[input_model, output_midi_seq],
614
+ outputs=midi_outputs + [js_msg],
615
+ queue=False)
616
+ finish_run_event.then(fn=render_audio,
617
+ inputs=[input_model, output_midi_seq, input_render_audio],
618
+ outputs=audio_outputs,
619
+ queue=False)
620
+ # stop_btn.click(None, [], [], cancels=run_event,
621
+ # queue=False)
622
+ undo_btn.click(undo_continuation, [input_model, output_midi_seq, output_continuation_state],
623
+ [output_midi_seq, output_continuation_state, js_msg], queue=False)
624
+ app.queue().launch(server_port=opt.port, share=opt.share, inbrowser=True, ssr_mode=False)
625
+ thread_pool.shutdown()
midi_model.py ADDED
@@ -0,0 +1,250 @@
1
+ import json
2
+ from typing import Union, Dict, Any
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import tqdm
9
+ from peft import PeftConfig, LoraModel, load_peft_weights, set_peft_model_state_dict
10
+ from transformers import LlamaModel, LlamaConfig, DynamicCache, PretrainedConfig, PreTrainedModel
11
+
12
+ from midi_tokenizer import MIDITokenizerV1, MIDITokenizerV2, MIDITokenizer
13
+
14
+ config_name_list = ["tv1-medium", "tv2-medium", "tv2o-medium", "tv2-large", "tv2o-large"]
15
+
16
+
17
+ class MIDIModelConfig(PretrainedConfig):
18
+ model_type = "midi_model"
19
+
20
+ def __init__(self,
21
+ tokenizer: Union[MIDITokenizerV1, MIDITokenizerV2, Dict] = None,
22
+ net_config: Union[LlamaConfig, Dict] = None,
23
+ net_token_config: Union[LlamaConfig, Dict] = None,
24
+ **kwargs):
25
+ super().__init__(**kwargs)
26
+ if tokenizer:
27
+ if isinstance(tokenizer, dict):
28
+ self.tokenizer = MIDITokenizer(tokenizer["version"])
29
+ self.tokenizer.set_optimise_midi(tokenizer["optimise_midi"])
30
+ else:
31
+ self.tokenizer = tokenizer
32
+ else:
33
+ self.tokenizer = MIDITokenizer()
34
+ if net_config:
35
+ if isinstance(net_config, dict):
36
+ self.net_config = LlamaConfig(**net_config)
37
+ else:
38
+ self.net_config = net_config
39
+ else:
40
+ self.net_config = LlamaConfig()
41
+ if net_token_config:
42
+ if isinstance(net_token_config, dict):
43
+ self.net_token_config = LlamaConfig(**net_token_config)
44
+ else:
45
+ self.net_token_config = net_token_config
46
+ else:
47
+ self.net_token_config = LlamaConfig()
48
+ self.n_embd = self.net_token_config.hidden_size
49
+
50
+ def to_dict(self) -> Dict[str, Any]:
51
+ d = super().to_dict()
52
+ d["tokenizer"] = self.tokenizer.to_dict()
53
+ return d
54
+
55
+ def __str__(self):
56
+ d = {
57
+ "net": self.net_config.to_json_string(use_diff=False),
58
+ "net_token": self.net_token_config.to_json_string(use_diff=False)
59
+ }
60
+ return json.dumps(d, indent=4)
61
+
62
+ @staticmethod
63
+ def get_config(tokenizer_ver="v2", optimise_midi=True, n_layer=12, n_head=16, n_embd=1024, n_inner=4096):
64
+ tokenizer = MIDITokenizer(tokenizer_ver)
65
+ tokenizer.set_optimise_midi(optimise_midi)
66
+ net_config = LlamaConfig(vocab_size=tokenizer.vocab_size,
67
+ hidden_size=n_embd, num_attention_heads=n_head,
68
+ num_hidden_layers=n_layer, intermediate_size=n_inner,
69
+ pad_token_id=tokenizer.pad_id, max_position_embeddings=4096,
70
+ use_cache=False)
71
+ net_token_config = LlamaConfig(vocab_size=tokenizer.vocab_size,
72
+ hidden_size=n_embd, num_attention_heads=n_head // 4,
73
+ num_hidden_layers=n_layer // 4, intermediate_size=n_inner // 4,
74
+ pad_token_id=tokenizer.pad_id, max_position_embeddings=4096,
75
+ use_cache=False)
76
+ return MIDIModelConfig(tokenizer, net_config, net_token_config)
77
+
78
+ @staticmethod
79
+ def from_name(name="tv2o-medium"):
80
+ tv, size = name.split("-")
81
+ tv = tv[1:]
82
+ if tv[-1] == "o":
83
+ o = True
84
+ tv = tv[:-1]
85
+ else:
86
+ o = False
87
+ if tv not in ["v1", "v2"]:
88
+ raise ValueError(f"Unknown tokenizer version {tv}")
89
+ if size == "medium":
90
+ return MIDIModelConfig.get_config(tokenizer_ver=tv, optimise_midi=o,
91
+ n_layer=12, n_head=16, n_embd=1024, n_inner=4096)
92
+ elif size == "large":
93
+ return MIDIModelConfig.get_config(tokenizer_ver=tv, optimise_midi=o,
94
+ n_layer=24, n_head=16, n_embd=1024, n_inner=4096)
95
+ else:
96
+ raise ValueError(f"Unknown model size {size}")
97
+
98
+
99
+ class MIDIModel(PreTrainedModel):
100
+ config_class = MIDIModelConfig
101
+
102
+ def __init__(self, config: MIDIModelConfig, *args, **kwargs):
103
+ super(MIDIModel, self).__init__(config, *args, **kwargs)
104
+ self.tokenizer = config.tokenizer
105
+ self.net = LlamaModel(config.net_config)
106
+ self.net_token = LlamaModel(config.net_token_config)
107
+ self.lm_head = nn.Linear(config.n_embd, self.tokenizer.vocab_size, bias=False)
108
+
109
+ def load_merge_lora(self, model_id):
110
+ peft_config = PeftConfig.from_pretrained(model_id)
111
+ model = LoraModel(self, peft_config, adapter_name="default")
112
+ adapter_state_dict = load_peft_weights(model_id, device=str(self.device))
113
+ set_peft_model_state_dict(self, adapter_state_dict, "default")
114
+ return model.merge_and_unload()
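+ # wraps the model as a peft LoraModel, loads the adapter weights, then folds them into the
+ # base weights via merge_and_unload(), so inference afterwards needs no peft wrapper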
115
+
116
+ def forward_token(self, hidden_state=None, x=None, cache=None):
117
+ """
118
+
119
+ :param hidden_state: (batch_size, n_embd)
120
+ :param x: (batch_size, token_sequence_length)
121
+ :param cache: Cache
122
+ :return: (batch_size, 1 + token_sequence_length, vocab_size)
123
+ """
124
+ if hidden_state is not None:
125
+ # when a cache is used, hidden_state only needs to be passed on the first call
126
+ hidden_state = hidden_state.unsqueeze(1) # (batch_size, 1, n_embd)
127
+ if x is not None:
128
+ x = self.net_token.embed_tokens(x)
129
+ if hidden_state is not None:
130
+ x = torch.cat([hidden_state, x], dim=1)
131
+ hidden_state = x
132
+ hidden_state = self.net_token.forward(inputs_embeds=hidden_state,
133
+ past_key_values=cache,
134
+ use_cache=cache is not None).last_hidden_state
135
+ return self.lm_head(hidden_state)
136
+
137
+ def forward(self, x, cache = None):
138
+ """
139
+ :param x: (batch_size, midi_sequence_length, token_sequence_length)
140
+ :param cache: Cache
141
+ :return: hidden (batch_size, midi_sequence_length, n_embd)
142
+ """
143
+
144
+ # merge token sequence
145
+ x = self.net.embed_tokens(x)
146
+ x = x.sum(dim=-2)
147
+ x = self.net.forward(inputs_embeds=x,
148
+ past_key_values=cache,
149
+ use_cache=cache is not None)
150
+ return x.last_hidden_state
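+ # two-stage decoding: `net` runs over whole events (the sub-token embeddings of each event
+ # are summed into a single vector per position), while `net_token` expands each event hidden
+ # state back into its sub-token sequence autoregressively (see forward_token above)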
151
+
152
+ def sample_top_p_k(self, probs, p, k, generator=None):
153
+ probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
154
+ probs_sum = torch.cumsum(probs_sort, dim=-1)
155
+ mask = probs_sum - probs_sort > p
156
+ probs_sort[mask] = 0.0
157
+ mask = torch.zeros(probs_sort.shape[-1], device=probs_sort.device)
158
+ mask[:k] = 1
159
+ probs_sort = probs_sort * mask
160
+ probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
161
+ shape = probs_sort.shape
162
+ next_token = torch.multinomial(probs_sort.reshape(-1, shape[-1]),
163
+ num_samples=1, generator=generator).reshape(*shape[:-1], 1)
164
+ next_token = torch.gather(probs_idx, -1, next_token).reshape(*shape[:-1])
165
+ return next_token
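+ # worked example: sorted probs [0.5, 0.3, 0.1, 0.06, 0.04] with p = 0.8, k = 2:
+ # cumsum - probs_sort = [0, 0.5, 0.8, 0.9, 0.96], so top-p keeps the first three entries;
+ # the top-k mask then keeps [0.5, 0.3], renormalised to [0.625, 0.375] before multinomial sampling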
166
+
167
+ @torch.inference_mode()
168
+ def generate(self, prompt=None, batch_size=1, max_len=512, temp=1.0, top_p=0.98, top_k=20, generator=None):
169
+ tokenizer = self.tokenizer
170
+ max_token_seq = tokenizer.max_token_seq
171
+ if prompt is None:
172
+ input_tensor = torch.full((1, max_token_seq), tokenizer.pad_id, dtype=torch.long, device=self.device)
173
+ input_tensor[0, 0] = tokenizer.bos_id # bos
174
+ input_tensor = input_tensor.unsqueeze(0)
175
+ input_tensor = torch.cat([input_tensor] * batch_size, dim=0)
176
+ else:
177
+ if len(prompt.shape) == 2:
178
+ prompt = prompt[None, :]
179
+ prompt = np.repeat(prompt, repeats=batch_size, axis=0)
180
+ elif prompt.shape[0] == 1:
181
+ prompt = np.repeat(prompt, repeats=batch_size, axis=0)
182
+ elif len(prompt.shape) != 3 or prompt.shape[0] != batch_size:
183
+ raise ValueError(f"invalid shape for prompt, {prompt.shape}")
184
+ prompt = prompt[..., :max_token_seq]
185
+ if prompt.shape[-1] < max_token_seq:
186
+ prompt = np.pad(prompt, ((0, 0), (0, 0), (0, max_token_seq - prompt.shape[-1])),
187
+ mode="constant", constant_values=tokenizer.pad_id)
188
+ input_tensor = torch.from_numpy(prompt).to(dtype=torch.long, device=self.device)
189
+
190
+ cur_len = input_tensor.shape[1]
191
+ bar = tqdm.tqdm(desc="generating", total=max_len - cur_len)
192
+ cache1 = DynamicCache()
193
+ past_len = 0
194
+ with bar:
195
+ while cur_len < max_len:
196
+ end = [False] * batch_size
197
+ hidden = self.forward(input_tensor[:, past_len:], cache=cache1)[:, -1]
198
+ next_token_seq = None
199
+ event_names = [""] * batch_size
200
+ cache2 = DynamicCache()
201
+ for i in range(max_token_seq):
202
+ mask = torch.zeros((batch_size, tokenizer.vocab_size), dtype=torch.int64, device=self.device)
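+ # constrained decoding: the mask built below only allows tokens that are grammatical at
+ # position i: an event type (or eos) at i == 0, then the id range of the chosen event's
+ # i-th parameter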
203
+ for b in range(batch_size):
204
+ if end[b]:
205
+ mask[b, tokenizer.pad_id] = 1
206
+ continue
207
+ if i == 0:
208
+ mask[b, list(tokenizer.event_ids.values()) + [tokenizer.eos_id]] = 1
209
+ else:
210
+ param_names = tokenizer.events[event_names[b]]
211
+ if i > len(param_names):
212
+ mask[b, tokenizer.pad_id] = 1
213
+ continue
214
+ mask[b, tokenizer.parameter_ids[param_names[i - 1]]] = 1
215
+ mask = mask.unsqueeze(1)
216
+ x = next_token_seq
217
+ if i != 0:
218
+ # cached
219
+ hidden = None
220
+ x = x[:, -1:]
221
+ logits = self.forward_token(hidden, x, cache=cache2)[:, -1:]
222
+ scores = torch.softmax(logits / temp, dim=-1) * mask
223
+ samples = self.sample_top_p_k(scores, top_p, top_k, generator=generator)
224
+ if i == 0:
225
+ next_token_seq = samples
226
+ for b in range(batch_size):
227
+ if end[b]:
228
+ continue
229
+ eid = samples[b].item()
230
+ if eid == tokenizer.eos_id:
231
+ end[b] = True
232
+ else:
233
+ event_names[b] = tokenizer.id_events[eid]
234
+ else:
235
+ next_token_seq = torch.cat([next_token_seq, samples], dim=1)
236
+ if all([len(tokenizer.events[event_names[b]]) == i for b in range(batch_size) if not end[b]]):
237
+ break
238
+
239
+ if next_token_seq.shape[1] < max_token_seq:
240
+ next_token_seq = F.pad(next_token_seq, (0, max_token_seq - next_token_seq.shape[1]),
241
+ "constant", value=tokenizer.pad_id)
242
+ next_token_seq = next_token_seq.unsqueeze(1)
243
+ input_tensor = torch.cat([input_tensor, next_token_seq], dim=1)
244
+ past_len = cur_len
245
+ cur_len += 1
246
+ bar.update(1)
247
+
248
+ if all(end):
249
+ break
250
+ return input_tensor.cpu().numpy()
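+ # minimal usage sketch (illustrative only; assumes the hub repo hosts a checkpoint in this
+ # MIDIModel format):
+ # model = MIDIModel.from_pretrained("skytnt/midi-model-tv2o-medium").to("cuda").eval()
+ # seq = model.generate(batch_size=1, max_len=256, top_p=0.98, top_k=20)
+ # score = model.tokenizer.detokenize(seq[0].tolist()) # MIDI.score2midi(score) then gives bytes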
midi_synthesizer.py ADDED
@@ -0,0 +1,81 @@
1
+ from threading import Lock
2
+
3
+ import fluidsynth
4
+ import numpy as np
5
+
6
+
7
+ class MidiSynthesizer:
8
+ def __init__(self, soundfont_path, sample_rate=44100):
9
+ self.soundfont_path = soundfont_path
10
+ self.sample_rate = sample_rate
11
+ fl = fluidsynth.Synth(samplerate=float(sample_rate))
12
+ sfid = fl.sfload(soundfont_path)
13
+ self.devices = [[fl, sfid, False]]
14
+ self.devices_lock = Lock()
15
+
16
+ def get_fluidsynth(self):
17
+ with self.devices_lock:
18
+ for device in self.devices:
19
+ if not device[2]:
20
+ device[2] = True
21
+ return device
22
+ fl = fluidsynth.Synth(samplerate=float(self.sample_rate))
23
+ sfid = fl.sfload(self.soundfont_path)
24
+ device = [fl, sfid, True]
25
+ self.devices.append(device)
26
+ return device
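+ # simple synth pool: reuse an idle fluidsynth instance when possible, grow the pool otherwise;
+ # the lock guards the pool because callers may synthesize several outputs in parallel
+ # (see render_audio in the app)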
27
+
28
+ def release_fluidsynth(self, device):
29
+ device[0].system_reset()
30
+ device[0].get_samples(self.sample_rate * 5) # render 5 s of audio so lingering notes decay before reuse
31
+ device[2] = False
32
+
33
+ def synthesis(self, midi_opus):
34
+ ticks_per_beat = midi_opus[0]
35
+ event_list = []
36
+ for track_idx, track in enumerate(midi_opus[1:]):
37
+ abs_t = 0
38
+ for event in track:
39
+ abs_t += event[1]
40
+ event_new = [*event]
41
+ event_new[1] = abs_t
42
+ event_list.append(event_new)
43
+ event_list = sorted(event_list, key=lambda e: e[1])
44
+
45
+ tempo = int((60 / 120) * 10 ** 6) # default 120 bpm
46
+ ss = np.empty((0, 2), dtype=np.int16)
47
+ device = self.get_fluidsynth()
48
+ fl, sfid = device[:-1]
49
+ last_t = 0
50
+ for c in range(16):
51
+ fl.program_select(c, sfid, 128 if c == 9 else 0, 0)
52
+ for event in event_list:
53
+ name = event[0]
54
+ sample_len = int(((event[1] / ticks_per_beat) * tempo / (10 ** 6)) * self.sample_rate)
55
+ sample_len -= int(((last_t / ticks_per_beat) * tempo / (10 ** 6)) * self.sample_rate)
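+ # i.e. audio frames rendered since the previous event:
+ # frames = (delta_ticks / ticks_per_beat) * (tempo_us / 1e6) * sample_rate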
56
+ last_t = event[1]
57
+ if sample_len > 0:
58
+ sample = fl.get_samples(sample_len).reshape(sample_len, 2)
59
+ ss = np.concatenate([ss, sample])
60
+ if name == "set_tempo":
61
+ tempo = event[2]
62
+ elif name == "patch_change":
63
+ c, p = event[2:4]
64
+ fl.program_select(c, sfid, 128 if c == 9 else 0, p)
65
+ elif name == "control_change":
66
+ c, cc, v = event[2:5]
67
+ fl.cc(c, cc, v)
68
+ elif name == "note_on" and event[3] > 0:
69
+ c, p, v = event[2:5]
70
+ fl.noteon(c, p, v)
71
+ elif name == "note_off" or (name == "note_on" and event[3] == 0):
72
+ c, p = event[2:4]
73
+ fl.noteoff(c, p)
74
+
75
+ self.release_fluidsynth(device)
76
+ if ss.shape[0] > 0:
77
+ max_val = np.abs(ss).max()
78
+ if max_val != 0:
79
+ ss = (ss / max_val) * np.iinfo(np.int16).max
80
+ ss = ss.astype(np.int16)
81
+ return ss
midi_tokenizer.py ADDED
@@ -0,0 +1,1196 @@
1
+ import random
2
+ from typing import Dict, Any
3
+
4
+ import PIL.Image
5
+ import numpy as np
6
+
7
+
8
+ class MIDITokenizerV1:
9
+ def __init__(self):
10
+ self.version = "v1"
11
+ self.optimise_midi = False
12
+ self.vocab_size = 0
13
+
14
+ def allocate_ids(size):
15
+ ids = [self.vocab_size + i for i in range(size)]
16
+ self.vocab_size += size
17
+ return ids
18
+
19
+ self.pad_id = allocate_ids(1)[0]
20
+ self.bos_id = allocate_ids(1)[0]
21
+ self.eos_id = allocate_ids(1)[0]
22
+ self.events = {
23
+ "note": ["time1", "time2", "track", "duration", "channel", "pitch", "velocity"],
24
+ "patch_change": ["time1", "time2", "track", "channel", "patch"],
25
+ "control_change": ["time1", "time2", "track", "channel", "controller", "value"],
26
+ "set_tempo": ["time1", "time2", "track", "bpm"],
27
+ }
28
+ self.event_parameters = {
29
+ "time1": 128, "time2": 16, "duration": 2048, "track": 128, "channel": 16, "pitch": 128, "velocity": 128,
30
+ "patch": 128, "controller": 128, "value": 128, "bpm": 256
31
+ }
32
+ self.event_ids = {e: allocate_ids(1)[0] for e in self.events.keys()}
33
+ self.id_events = {i: e for e, i in self.event_ids.items()}
34
+ self.parameter_ids = {p: allocate_ids(s) for p, s in self.event_parameters.items()}
35
+ self.max_token_seq = max([len(ps) for ps in self.events.values()]) + 1
36
+
37
+ def to_dict(self) -> Dict[str, Any]:
38
+ d = {
39
+ "version":self.version,
40
+ "optimise_midi":self.optimise_midi,
41
+ "vocab_size": self.vocab_size,
42
+ "events": self.events,
43
+ "event_parameters": self.event_parameters,
44
+ "max_token_seq": self.max_token_seq,
45
+ "pad_id": self.pad_id,
46
+ "bos_id": self.bos_id,
47
+ "eos_id": self.eos_id,
48
+ }
49
+ return d
50
+
51
+ def set_optimise_midi(self, optimise_midi=True):
52
+ self.optimise_midi = optimise_midi
53
+
54
+ @staticmethod
55
+ def tempo2bpm(tempo):
56
+ tempo = tempo / 10 ** 6 # us to s
57
+ bpm = 60 / tempo
58
+ return bpm
59
+
60
+ @staticmethod
61
+ def bpm2tempo(bpm):
62
+ if bpm == 0:
63
+ bpm = 1
64
+ tempo = int((60 / bpm) * 10 ** 6)
65
+ return tempo
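+ # tempo is in microseconds per quarter note, so e.g. 120 bpm <-> 500000 us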
66
+
67
+ def tokenize(self, midi_score, add_bos_eos=True, cc_eps=4, tempo_eps=4,
68
+ remap_track_channel=None, add_default_instr=None, remove_empty_channels=None):
69
+ if remap_track_channel is None: # set default value
70
+ remap_track_channel = self.optimise_midi
71
+ if add_default_instr is None:
72
+ add_default_instr = self.optimise_midi
73
+ if remove_empty_channels is None:
74
+ remove_empty_channels = self.optimise_midi
75
+
76
+ ticks_per_beat = midi_score[0]
77
+ event_list = {}
78
+ track_idx_map = {i: dict() for i in range(16)}
79
+ track_idx_dict = {}
80
+ channels = []
81
+ patch_channels = []
82
+ empty_channels = [True] * 16
83
+ channel_note_tracks = {i: list() for i in range(16)}
84
+ for track_idx, track in enumerate(midi_score[1:129]):
85
+ last_notes = {}
86
+ patch_dict = {}
87
+ control_dict = {}
88
+ last_tempo = 0
89
+ for event in track:
90
+ if event[0] not in self.events:
91
+ continue
92
+ c = -1
93
+ t = round(16 * event[1] / ticks_per_beat) # quantization
94
+ new_event = [event[0], t // 16, t % 16, track_idx] + event[2:]
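+ # t is on a 1/16-of-a-beat grid: time1 = whole beats (t // 16), time2 = position within the beat (t % 16)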
95
+ if event[0] == "note":
96
+ c = event[3]
97
+ if c > 15 or c < 0:
98
+ continue
99
+ empty_channels[c] = False
100
+ track_idx_dict.setdefault(c, track_idx)
101
+ note_tracks = channel_note_tracks[c]
102
+ if track_idx not in note_tracks:
103
+ note_tracks.append(track_idx)
104
+ new_event[4] = max(1, round(16 * new_event[4] / ticks_per_beat))
105
+ elif event[0] == "set_tempo":
106
+ if new_event[4] == 0: # invalid tempo
107
+ continue
108
+ bpm = int(self.tempo2bpm(new_event[4]))
109
+ new_event[4] = min(bpm, 255)
110
+ if event[0] == "note":
111
+ key = tuple(new_event[:4] + new_event[5:-1])
112
+ else:
113
+ key = tuple(new_event[:-1])
114
+ if event[0] == "patch_change":
115
+ c, p = event[2:]
116
+ if c > 15 or c < 0:
117
+ continue
118
+ last_p = patch_dict.setdefault(c, None)
119
+ if last_p == p:
120
+ continue
121
+ patch_dict[c] = p
122
+ if c not in patch_channels:
123
+ patch_channels.append(c)
124
+ elif event[0] == "control_change":
125
+ c, cc, v = event[2:]
126
+ if c > 15 or c < 0:
127
+ continue
128
+ last_v = control_dict.setdefault((c, cc), 0)
129
+ if abs(last_v - v) < cc_eps:
130
+ continue
131
+ control_dict[(c, cc)] = v
132
+ elif event[0] == "set_tempo":
133
+ tempo = new_event[-1]
134
+ if abs(last_tempo - tempo) < tempo_eps:
135
+ continue
136
+ last_tempo = tempo
137
+
138
+ if c != -1:
139
+ if c not in channels:
140
+ channels.append(c)
141
+ tr_map = track_idx_map[c]
142
+ if track_idx not in tr_map:
143
+ tr_map[track_idx] = 0
144
+
145
+ if event[0] == "note": # to eliminate note overlap due to quantization
146
+ cp = tuple(new_event[5:7])
147
+ if cp in last_notes:
148
+ last_note_key, last_note = last_notes[cp]
149
+ last_t = last_note[1] * 16 + last_note[2]
150
+ last_note[4] = max(0, min(last_note[4], t - last_t))
151
+ if last_note[4] == 0:
152
+ event_list.pop(last_note_key)
153
+ last_notes[cp] = (key, new_event)
154
+ event_list[key] = new_event
155
+ event_list = list(event_list.values())
156
+
157
+ empty_channels = [c for c in channels if empty_channels[c]]
158
+
159
+ if remap_track_channel:
160
+ patch_channels = []
161
+ channels_count = 0
162
+ channels_map = {9: 9} if 9 in channels else {}
163
+ if remove_empty_channels:
164
+ channels = sorted(channels, key=lambda x: 1 if x in empty_channels else 0)
165
+ for c in channels:
166
+ if c == 9:
167
+ continue
168
+ channels_map[c] = channels_count
169
+ channels_count += 1
170
+ if channels_count == 9:
171
+ channels_count = 10
172
+ channels = list(channels_map.values())
173
+
174
+ track_count = 0
175
+ track_idx_map_order = [k for k, v in sorted(list(channels_map.items()), key=lambda x: x[1])]
176
+ for c in track_idx_map_order: # tracks not to remove
177
+ if remove_empty_channels and c in empty_channels:
178
+ continue
179
+ tr_map = track_idx_map[c]
180
+ for track_idx in tr_map:
181
+ note_tracks = channel_note_tracks[c]
182
+ if len(note_tracks) != 0 and track_idx not in note_tracks:
183
+ continue
184
+ track_count += 1
185
+ tr_map[track_idx] = track_count
186
+ for c in track_idx_map_order: # tracks to remove
187
+ if not (remove_empty_channels and c in empty_channels):
188
+ continue
189
+ tr_map = track_idx_map[c]
190
+ for track_idx in tr_map:
191
+ note_tracks = channel_note_tracks[c]
192
+ if not (len(note_tracks) != 0 and track_idx not in note_tracks):
193
+ continue
194
+ track_count += 1
195
+ tr_map[track_idx] = track_count
196
+
197
+ empty_channels = [channels_map[c] for c in empty_channels]
198
+ track_idx_dict = {}
199
+ for event in event_list:
200
+ name = event[0]
201
+ track_idx = event[3]
202
+ if name == "note":
203
+ c = event[5]
204
+ event[5] = channels_map[c]
205
+ event[3] = track_idx_map[c][track_idx]
206
+ track_idx_dict.setdefault(event[5], event[3])
207
+ # setdefault, so track_idx is the first track seen for this channel
208
+ elif name == "set_tempo":
209
+ event[3] = 0
210
+ elif name == "control_change" or name == "patch_change":
211
+ c = event[4]
212
+ event[4] = channels_map[c]
213
+ tr_map = track_idx_map[c]
214
+ # move the event to the first track of the channel if its original track has no notes
215
+ note_tracks = channel_note_tracks[c]
216
+ if len(note_tracks) != 0 and track_idx not in note_tracks:
217
+ track_idx = channel_note_tracks[c][0]
218
+ new_track_idx = tr_map[track_idx]
219
+ event[3] = new_track_idx
220
+ if name == "patch_change" and event[4] not in patch_channels:
221
+ patch_channels.append(event[4])
222
+
223
+ if add_default_instr:
224
+ for c in channels:
225
+ if c not in patch_channels and c in track_idx_dict:
226
+ event_list.append(["patch_change", 0, 0, track_idx_dict[c], c, 0])
227
+
228
+ events_name_order = {"set_tempo": 0, "patch_change": 1, "control_change": 2, "note": 3}
229
+ events_order = lambda e: e[1:4] + [events_name_order[e[0]]]
230
+ event_list = sorted(event_list, key=events_order)
231
+
232
+ setup_events = {}
233
+ notes_in_setup = False
234
+ for i, event in enumerate(event_list): # optimise setup
235
+ new_event = [*event]
236
+ if event[0] != "note":
237
+ new_event[1] = 0
238
+ new_event[2] = 0
239
+ has_next = False
240
+ has_pre = False
241
+ if i < len(event_list) - 1:
242
+ next_event = event_list[i + 1]
243
+ has_next = event[1] + event[2] == next_event[1] + next_event[2]
244
+ if notes_in_setup and i > 0:
245
+ pre_event = event_list[i - 1]
246
+ has_pre = event[1] + event[2] == pre_event[1] + pre_event[2]
247
+ if (event[0] == "note" and not has_next) or (notes_in_setup and not has_pre):
248
+ event_list = sorted(setup_events.values(), key=events_order) + event_list[i:]
249
+ break
250
+ else:
251
+ if event[0] == "note":
252
+ notes_in_setup = True
253
+ key = tuple([event[0]] + event[3:-2])
254
+ else:
255
+ key = tuple([event[0]] + event[3:-1])
256
+ setup_events[key] = new_event
257
+
258
+ last_t1 = 0
259
+ midi_seq = []
260
+ for event in event_list:
261
+ if remove_empty_channels and event[0] in ["control_change", "patch_change"] and event[4] in empty_channels:
262
+ continue
263
+ cur_t1 = event[1]
264
+ event[1] = event[1] - last_t1
265
+ tokens = self.event2tokens(event)
266
+ if not tokens:
267
+ continue
268
+ midi_seq.append(tokens)
269
+ last_t1 = cur_t1
270
+
271
+ if add_bos_eos:
272
+ bos = [self.bos_id] + [self.pad_id] * (self.max_token_seq - 1)
273
+ eos = [self.eos_id] + [self.pad_id] * (self.max_token_seq - 1)
274
+ midi_seq = [bos] + midi_seq + [eos]
275
+ return midi_seq
276
+
277
+ def event2tokens(self, event):
278
+ name = event[0]
279
+ params = event[1:]
280
+ if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
281
+ return []
282
+ tokens = [self.event_ids[name]] + [self.parameter_ids[p][params[i]]
283
+ for i, p in enumerate(self.events[name])]
284
+ tokens += [self.pad_id] * (self.max_token_seq - len(tokens))
285
+ return tokens
286
+
287
+ def tokens2event(self, tokens):
288
+ if tokens[0] not in self.id_events:
289
+ return []
290
+ name = self.id_events[tokens[0]]
291
+ if len(tokens) <= len(self.events[name]):
292
+ return []
293
+ params = tokens[1:]
294
+ params = [params[i] - self.parameter_ids[p][0] for i, p in enumerate(self.events[name])]
295
+ if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
296
+ return []
297
+ event = [name] + params
298
+ return event
299
+
300
+ def detokenize(self, midi_seq):
301
+ ticks_per_beat = 480
302
+ tracks_dict = {}
303
+ t1 = 0
304
+ for tokens in midi_seq:
305
+ if tokens[0] in self.id_events:
306
+ event = self.tokens2event(tokens)
307
+ if not event:
308
+ continue
309
+ name = event[0]
310
+ if name == "set_tempo":
311
+ event[4] = self.bpm2tempo(event[4])
312
+ if event[0] == "note":
313
+ event[4] = int(event[4] * ticks_per_beat / 16)
314
+ t1 += event[1]
315
+ t = t1 * 16 + event[2]
316
+ t = int(t * ticks_per_beat / 16)
317
+ track_idx = event[3]
318
+ if track_idx not in tracks_dict:
319
+ tracks_dict[track_idx] = []
320
+ tracks_dict[track_idx].append([event[0], t] + event[4:])
321
+ tracks = [tr for idx, tr in sorted(list(tracks_dict.items()), key=lambda it: it[0])]
322
+
323
+ for i in range(len(tracks)): # to eliminate note overlap
324
+ track = tracks[i]
325
+ track = sorted(track, key=lambda e: e[1])
326
+ last_note_t = {}
327
+ zero_len_notes = []
328
+ for e in reversed(track):
329
+ if e[0] == "note":
330
+ t, d, c, p = e[1:5]
331
+ key = (c, p)
332
+ if key in last_note_t:
333
+ d = min(d, max(last_note_t[key] - t, 0))
334
+ last_note_t[key] = t
335
+ e[2] = d
336
+ if d == 0:
337
+ zero_len_notes.append(e)
338
+ for e in zero_len_notes:
339
+ track.remove(e)
340
+ tracks[i] = track
341
+ return [ticks_per_beat, *tracks]
342
+
343
+ def midi2img(self, midi_score):
344
+ ticks_per_beat = midi_score[0]
345
+ notes = []
346
+ max_time = 1
347
+ track_num = len(midi_score[1:])
348
+ for track_idx, track in enumerate(midi_score[1:]):
349
+ for event in track:
350
+ t = round(16 * event[1] / ticks_per_beat)
351
+ if event[0] == "note":
352
+ d = max(1, round(16 * event[2] / ticks_per_beat))
353
+ c, p = event[3:5]
354
+ max_time = max(max_time, t + d + 1)
355
+ notes.append((track_idx, c, p, t, d))
356
+ img = np.zeros((128, max_time, 3), dtype=np.uint8)
357
+ colors = {(i, j): np.random.randint(50, 256, 3) for i in range(track_num) for j in range(16)}
358
+ for note in notes:
359
+ tr, c, p, t, d = note
360
+ img[p, t: t + d] = colors[(tr, c)]
361
+ img = PIL.Image.fromarray(np.flip(img, 0))
362
+ return img
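+ # renders a piano-roll image: rows are pitches, columns are 1/16-beat steps, one random colour
+ # per (track, channel) pair; handy for a quick visual check of a score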
363
+
364
+ def augment(self, midi_seq, max_pitch_shift=4, max_vel_shift=10, max_cc_val_shift=10, max_bpm_shift=10,
365
+ max_track_shift=0, max_channel_shift=16):
366
+ pitch_shift = random.randint(-max_pitch_shift, max_pitch_shift)
367
+ vel_shift = random.randint(-max_vel_shift, max_vel_shift)
368
+ cc_val_shift = random.randint(-max_cc_val_shift, max_cc_val_shift)
369
+ bpm_shift = random.randint(-max_bpm_shift, max_bpm_shift)
370
+ track_shift = random.randint(0, max_track_shift)
371
+ channel_shift = random.randint(0, max_channel_shift)
372
+ midi_seq_new = []
373
+ for tokens in midi_seq:
374
+ tokens_new = [*tokens]
375
+ if tokens[0] in self.id_events:
376
+ name = self.id_events[tokens[0]]
377
+ for i, pn in enumerate(self.events[name]):
378
+ if pn == "track":
379
+ tr = tokens[1 + i] - self.parameter_ids[pn][0]
380
+ tr += track_shift
381
+ tr = tr % self.event_parameters[pn]
382
+ tokens_new[1 + i] = self.parameter_ids[pn][tr]
383
+ elif pn == "channel":
384
+ c = tokens[1 + i] - self.parameter_ids[pn][0]
385
+ c0 = c
386
+ c += channel_shift
387
+ c = c % self.event_parameters[pn]
388
+ if c0 == 9:
389
+ c = 9
390
+ elif c == 9:
391
+ c = (9 + channel_shift) % self.event_parameters[pn]
392
+ tokens_new[1 + i] = self.parameter_ids[pn][c]
393
+
394
+ if name == "note":
395
+ c = tokens[5] - self.parameter_ids["channel"][0]
396
+ p = tokens[6] - self.parameter_ids["pitch"][0]
397
+ v = tokens[7] - self.parameter_ids["velocity"][0]
398
+ if c != 9: # no shift for drums
399
+ p += pitch_shift
400
+ if not 0 <= p < 128:
401
+ return midi_seq
402
+ v += vel_shift
403
+ v = max(1, min(127, v))
404
+ tokens_new[6] = self.parameter_ids["pitch"][p]
405
+ tokens_new[7] = self.parameter_ids["velocity"][v]
406
+ elif name == "control_change":
407
+ cc = tokens[5] - self.parameter_ids["controller"][0]
408
+ val = tokens[6] - self.parameter_ids["value"][0]
409
+ if cc in [1, 2, 7, 11]:
410
+ val += cc_val_shift
411
+ val = max(1, min(127, val))
412
+ tokens_new[6] = self.parameter_ids["value"][val]
413
+ elif name == "set_tempo":
414
+ bpm = tokens[4] - self.parameter_ids["bpm"][0]
415
+ bpm += bpm_shift
416
+ bpm = max(1, min(255, bpm))
417
+ tokens_new[4] = self.parameter_ids["bpm"][bpm]
418
+ midi_seq_new.append(tokens_new)
419
+ return midi_seq_new
420
+
421
+ def check_quality(self, midi_seq, alignment_min=0.3, tonality_min=0.8, piano_max=0.7, notes_bandwidth_min=3,
422
+ notes_density_max=50, notes_density_min=2.5, total_notes_max=20000, total_notes_min=256,
423
+ note_window_size=16):
424
+ total_notes = 0
425
+ channels = []
426
+ time_hist = [0] * 16
427
+ note_windows = {}
428
+ notes_sametime = []
429
+ notes_density_list = []
430
+ tonality_list = []
431
+ notes_bandwidth_list = []
432
+ instruments = {}
433
+ piano_channels = []
434
+ abs_t1 = 0
435
+ last_t = 0
436
+ for tsi, tokens in enumerate(midi_seq):
437
+ event = self.tokens2event(tokens)
438
+ if not event:
439
+ continue
440
+ t1, t2, tr = event[1:4]
441
+ abs_t1 += t1
442
+ t = abs_t1 * 16 + t2
443
+ c = None
444
+ if event[0] == "note":
445
+ d, c, p, v = event[4:]
446
+ total_notes += 1
447
+ time_hist[t2] += 1
448
+ if c != 9: # ignore drum channel
449
+ if c not in instruments:
450
+ instruments[c] = 0
451
+ if c not in piano_channels:
452
+ piano_channels.append(c)
453
+ note_windows.setdefault(abs_t1 // note_window_size, []).append(p)
454
+ if last_t != t:
455
+ notes_sametime = [(et, p_) for et, p_ in notes_sametime if et > last_t]
456
+ notes_sametime_p = [p_ for _, p_ in notes_sametime]
457
+ if len(notes_sametime) > 0:
458
+ notes_bandwidth_list.append(max(notes_sametime_p) - min(notes_sametime_p))
459
+ notes_sametime.append((t + d - 1, p))
460
+ elif event[0] == "patch_change":
461
+ c, p = event[4:]
462
+ instruments[c] = p
463
+ if p == 0 and c not in piano_channels:
464
+ piano_channels.append(c)
465
+ if c is not None and c not in channels:
466
+ channels.append(c)
467
+ last_t = t
468
+ reasons = []
469
+ if total_notes < total_notes_min:
470
+ reasons.append("total_min")
471
+ if total_notes > total_notes_max:
472
+ reasons.append("total_max")
473
+ if len(note_windows) == 0 and total_notes > 0:
474
+ reasons.append("drum_only")
475
+ if reasons:
476
+ return False, reasons
477
+ time_hist = sorted(time_hist, reverse=True)
478
+ alignment = sum(time_hist[:2]) / total_notes
479
+ for notes in note_windows.values():
480
+ key_hist = [0] * 12
481
+ for p in notes:
482
+ key_hist[p % 12] += 1
483
+ key_hist = sorted(key_hist, reverse=True)
484
+ tonality_list.append(sum(key_hist[:7]) / len(notes))
485
+ notes_density_list.append(len(notes) / note_window_size)
486
+ tonality_list = sorted(tonality_list)
487
+ tonality = sum(tonality_list) / len(tonality_list)
488
+ notes_bandwidth = sum(notes_bandwidth_list) / len(notes_bandwidth_list) if notes_bandwidth_list else 0
489
+ notes_density = max(notes_density_list) if notes_density_list else 0
490
+ piano_ratio = len(piano_channels) / len(channels)
491
+ if len(channels) <= 3: # ignore piano threshold if it is a piano solo midi
492
+ piano_max = 1
493
+ if alignment < alignment_min: # check whether the notes align to the bars (some midi files are recorded from live playing)
494
+ reasons.append("alignment")
495
+ if tonality < tonality_min: # check whether the music is tonal
496
+ reasons.append("tonality")
497
+ if notes_bandwidth < notes_bandwidth_min: # check whether the music is only a single melodic line
498
+ reasons.append("bandwidth")
499
+ if not notes_density_min < notes_density < notes_density_max:
500
+ reasons.append("density")
501
+ if piano_ratio > piano_max: # check whether most instruments are piano (some midi files don't have instruments assigned correctly)
502
+ reasons.append("piano")
503
+ return not reasons, reasons
504
+
505
+
506
+ class MIDITokenizerV2:
507
+ def __init__(self):
508
+ self.version = "v2"
509
+ self.optimise_midi = False
510
+ self.vocab_size = 0
511
+
512
+ def allocate_ids(size):
513
+ ids = [self.vocab_size + i for i in range(size)]
514
+ self.vocab_size += size
515
+ return ids
516
+
517
+ self.pad_id = allocate_ids(1)[0]
518
+ self.bos_id = allocate_ids(1)[0]
519
+ self.eos_id = allocate_ids(1)[0]
520
+ self.events = {
521
+ "note": ["time1", "time2", "track", "channel", "pitch", "velocity", "duration"],
522
+ "patch_change": ["time1", "time2", "track", "channel", "patch"],
523
+ "control_change": ["time1", "time2", "track", "channel", "controller", "value"],
524
+ "set_tempo": ["time1", "time2", "track", "bpm"],
525
+ "time_signature": ["time1", "time2", "track", "nn", "dd"],
526
+ "key_signature": ["time1", "time2", "track", "sf", "mi"],
527
+ }
528
+ self.event_parameters = {
529
+ "time1": 128, "time2": 16, "duration": 2048, "track": 128, "channel": 16, "pitch": 128, "velocity": 128,
530
+ "patch": 128, "controller": 128, "value": 128, "bpm": 384, "nn": 16, "dd": 4, "sf": 15, "mi": 2
531
+ }
532
+ self.event_ids = {e: allocate_ids(1)[0] for e in self.events.keys()}
533
+ self.id_events = {i: e for e, i in self.event_ids.items()}
534
+ self.parameter_ids = {p: allocate_ids(s) for p, s in self.event_parameters.items()}
535
+ self.max_token_seq = max([len(ps) for ps in self.events.values()]) + 1
536
+
537
+ def to_dict(self) -> Dict[str, Any]:
538
+ d = {
539
+ "version":self.version,
540
+ "optimise_midi":self.optimise_midi,
541
+ "vocab_size": self.vocab_size,
542
+ "events": self.events,
543
+ "event_parameters": self.event_parameters,
544
+ "max_token_seq": self.max_token_seq,
545
+ "pad_id": self.pad_id,
546
+ "bos_id": self.bos_id,
547
+ "eos_id": self.eos_id,
548
+ }
549
+ return d
550
+
551
+ def set_optimise_midi(self, optimise_midi=True):
552
+ self.optimise_midi = optimise_midi
553
+
554
+ @staticmethod
555
+ def tempo2bpm(tempo):
556
+ tempo = tempo / 10 ** 6 # us to s
557
+ bpm = 60 / tempo
558
+ return bpm
559
+
560
+ @staticmethod
561
+ def bpm2tempo(bpm):
562
+ if bpm == 0:
563
+ bpm = 1
564
+ tempo = int((60 / bpm) * 10 ** 6)
565
+ return tempo
566
+
567
+ @staticmethod
568
+ def sf2key(sf):
569
+ # converts sf (the sharp/flat count from a key_signature event) to a key,
570
+ # where key is the pitch-class index from C to B (12 in total)
571
+ return (sf * 7) % 12
572
+
573
+ @staticmethod
574
+ def key2sf(k, mi):
575
+ # key to sf
576
+ sf = (k * 7) % 12
577
+ if sf > 6 or (mi == 1 and sf >= 5):
578
+ sf -= 12
579
+ return sf
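+ # circle-of-fifths arithmetic: sf2key maps a sharp/flat count to a pitch class, e.g. sf = 2
+ # (D major) -> (2 * 7) % 12 = 2 = D; key2sf inverts it since 7 * 7 = 49 = 1 (mod 12), then
+ # folds values above 6 (or above 4 for minor keys) down to the equivalent flat spelling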
580
+
581
+ @staticmethod
582
+ def detect_key_signature(key_hist, threshold=0.7):
583
+ if len(key_hist) != 12:
584
+ return None
585
+ if sum(key_hist) == 0:
586
+ return None
587
+ p = sum(sorted(key_hist, reverse=True)[:7]) / sum(key_hist)
588
+ if p < threshold:
589
+ return None
590
+ keys = [x[1] for x in sorted(zip(key_hist, range(len(key_hist))), reverse=True, key=lambda x: x[0])[:7]]
591
+ keys = sorted(keys)
592
+ semitones = []
593
+ for i in range(len(keys)):
594
+ dis = keys[i] - keys[i - 1]
595
+ if dis == 1 or dis == -11:
596
+ semitones.append(keys[i])
597
+ if len(semitones) != 2:
598
+ return None
599
+ semitones_dis = semitones[1] - semitones[0]
600
+ if semitones_dis == 5:
601
+ root_key = semitones[0]
602
+ elif semitones_dis == 7:
603
+ root_key = semitones[1]
604
+ else:
605
+ return None
606
+ return root_key
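+ # heuristic: take the 7 most frequent pitch classes; a diatonic set has exactly two semitone
+ # steps (E-F and B-C in C major) whose upper notes are the 4th degree and the tonic. If those
+ # two notes are 5 semitones apart the lower one is the tonic; if 7 apart, the upper one is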
607
+
608
+ def tokenize(self, midi_score, add_bos_eos=True, cc_eps=4, tempo_eps=4,
609
+ remap_track_channel=None, add_default_instr=None, remove_empty_channels=None):
610
+ if remap_track_channel is None: # set default value
611
+ remap_track_channel = self.optimise_midi
612
+ if add_default_instr is None:
613
+ add_default_instr = self.optimise_midi
614
+ if remove_empty_channels is None:
615
+ remove_empty_channels = self.optimise_midi
616
+
617
+ ticks_per_beat = midi_score[0]
618
+ event_list = {}
619
+ track_idx_map = {i: dict() for i in range(16)}
620
+ track_idx_dict = {}
621
+ channels = []
622
+ patch_channels = []
623
+ empty_channels = [True] * 16
624
+ channel_note_tracks = {i: list() for i in range(16)}
625
+ note_key_hist = [0]*12
626
+ key_sigs = []
627
+ track_to_channels = {}
628
+ for track_idx, track in enumerate(midi_score[1:129]):
629
+ last_notes = {}
630
+ patch_dict = {}
631
+ control_dict = {}
632
+ last_bpm = 0
633
+ track_channels = []
634
+ track_to_channels.setdefault(track_idx, track_channels)
635
+ for event in track:
636
+ if event[0] not in self.events:
637
+ continue
638
+ name = event[0]
639
+ c = -1
640
+ t = round(16 * event[1] / ticks_per_beat) # quantization
641
+ new_event = [name, t // 16, t % 16, track_idx]
642
+ if name == "note":
643
+ d, c, p, v = event[2:]
644
+ if not (0 <= c <= 15):
645
+ continue
646
+ d = max(1, round(16 * d / ticks_per_beat))
647
+ new_event += [c, p, v, d]
648
+ empty_channels[c] = False
649
+ track_idx_dict.setdefault(c, track_idx)
650
+ note_tracks = channel_note_tracks[c]
651
+ if track_idx not in note_tracks:
652
+ note_tracks.append(track_idx)
653
+ if c != 9:
654
+ note_key_hist[p%12] += 1
655
+ if c not in track_channels:
656
+ track_channels.append(c)
657
+ elif name == "patch_change":
658
+ c, p = event[2:]
659
+ if not (0 <= c <= 15):
660
+ continue
661
+ new_event += [c, p]
662
+ last_p = patch_dict.setdefault(c, None)
663
+ if last_p == p:
664
+ continue
665
+ patch_dict[c] = p
666
+ if c not in patch_channels:
667
+ patch_channels.append(c)
668
+ elif name == "control_change":
669
+ c, cc, v = event[2:]
670
+ if not (0 <= c <= 15):
671
+ continue
672
+ new_event += [c, cc, v]
673
+ last_v = control_dict.setdefault((c, cc), 0)
674
+ if abs(last_v - v) < cc_eps:
675
+ continue
676
+ control_dict[(c, cc)] = v
677
+ elif name == "set_tempo":
678
+ tempo = event[2]
679
+ if tempo == 0: # invalid tempo
680
+ continue
681
+ bpm = min(int(self.tempo2bpm(tempo)), 383)
682
+ new_event += [bpm]
683
+ if abs(last_bpm - bpm) < tempo_eps:
684
+ continue
685
+ last_bpm = bpm
686
+ elif name == "time_signature":
687
+ nn, dd = event[2:4]
688
+ if not (1 <= nn <= 16 and 1 <= dd <= 4): # invalid
689
+ continue
690
+ nn -= 1 # make it start from 0
691
+ dd -= 1
692
+ new_event += [nn, dd]
693
+ elif name == "key_signature":
694
+ sf, mi = event[2:]
695
+ if not (-7 <= sf <= 7 and 0 <= mi <= 1): # invalid
696
+ continue
697
+ sf += 7
698
+ new_event += [sf, mi]
699
+ key_sigs.append(new_event)
700
+
701
+ if name in ["note", "time_signature", "key_signature"]:
702
+ key = tuple(new_event[:-2])
703
+ else:
704
+ key = tuple(new_event[:-1])
705
+
706
+ if c != -1:
707
+ if c not in channels:
708
+ channels.append(c)
709
+ tr_map = track_idx_map[c]
710
+ if track_idx not in tr_map:
711
+ tr_map[track_idx] = 0
712
+
713
+ if event[0] == "note": # to eliminate note overlap due to quantization
714
+ cp = tuple(new_event[4:6]) # channel pitch
715
+ if cp in last_notes:
716
+ last_note_key, last_note = last_notes[cp]
717
+ last_t = last_note[1] * 16 + last_note[2]
718
+ last_note[-1] = max(0, min(last_note[-1], t - last_t)) # modify duration
719
+ if last_note[-1] == 0:
720
+ event_list.pop(last_note_key)
721
+ last_notes[cp] = (key, new_event)
722
+ event_list[key] = new_event
723
+ event_list = list(event_list.values())
724
+
725
+ empty_channels = [c for c in channels if empty_channels[c]]
726
+
727
+ if remap_track_channel:
728
+ patch_channels = []
729
+ channels_count = 0
730
+ channels_map = {9: 9} if 9 in channels else {}
731
+ if remove_empty_channels:
732
+ channels = sorted(channels, key=lambda x: 1 if x in empty_channels else 0)
733
+ for c in channels:
734
+ if c == 9:
735
+ continue
736
+ channels_map[c] = channels_count
737
+ channels_count += 1
738
+ if channels_count == 9:
739
+ channels_count = 10
740
+ channels = list(channels_map.values())
741
+
742
+ track_count = 0
743
+ track_idx_map_order = [k for k, v in sorted(list(channels_map.items()), key=lambda x: x[1])]
744
+ for c in track_idx_map_order: # tracks not to remove
745
+ if remove_empty_channels and c in empty_channels:
746
+ continue
747
+ tr_map = track_idx_map[c]
748
+ for track_idx in tr_map:
749
+ note_tracks = channel_note_tracks[c]
750
+ if len(note_tracks) != 0 and track_idx not in note_tracks:
751
+ continue
752
+ track_count += 1
753
+ tr_map[track_idx] = track_count
754
+ for c in track_idx_map_order: # tracks to remove
755
+ if not (remove_empty_channels and c in empty_channels):
756
+ continue
757
+ tr_map = track_idx_map[c]
758
+ for track_idx in tr_map:
759
+ note_tracks = channel_note_tracks[c]
760
+ if not (len(note_tracks) != 0 and track_idx not in note_tracks):
761
+ continue
762
+ track_count += 1
763
+ tr_map[track_idx] = track_count
764
+
765
+             empty_channels = [channels_map[c] for c in empty_channels]
+             track_idx_dict = {}
+             key_sigs = []
+             key_signature_to_add = []
+             key_signature_to_remove = []
+             for event in event_list:
+                 name = event[0]
+                 track_idx = event[3]
+                 if name == "note":
+                     c = event[4]
+                     event[4] = channels_map[c]  # channel
+                     event[3] = track_idx_map[c][track_idx]  # track
+                     track_idx_dict.setdefault(event[4], event[3])
+                     # setdefault, so the track_idx is the first one of the channel
+                 elif name in ["set_tempo", "time_signature"]:
+                     event[3] = 0  # set track 0 for meta events
+                 elif name == "key_signature":
+                     new_channel_track_idxs = []
+                     for c, tr_map in track_idx_map.items():
+                         if track_idx in tr_map:
+                             new_track_idx = tr_map[track_idx]
+                             c = channels_map[c]
+                             new_channel_track_idx = (c, new_track_idx)
+                             if new_track_idx == 0:
+                                 continue
+                             if new_channel_track_idx not in new_channel_track_idxs:
+                                 new_channel_track_idxs.append(new_channel_track_idx)
+
+                     if len(new_channel_track_idxs) == 0:
+                         if event[3] == 0:  # keep key_signature on track 0 (meta)
+                             key_sigs.append(event)
+                             continue
+                         event[3] = -1  # avoid removing the same event twice
+                         key_signature_to_remove.append(event)  # empty track
+                         continue
+                     c, nt = new_channel_track_idxs[0]
+                     event[3] = nt
+                     key_sigs.append(event)
+                     if c == 9:
+                         event[4] = 7  # sf=0
+                     for c, nt in new_channel_track_idxs[1:]:
+                         new_event = [*event]
+                         new_event[3] = nt
+                         if c == 9:
+                             new_event[4] = 7  # sf=0
+                         key_sigs.append(new_event)
+                         key_signature_to_add.append(new_event)
+                 elif name == "control_change" or name == "patch_change":
+                     c = event[4]
+                     event[4] = channels_map[c]  # channel
+                     tr_map = track_idx_map[c]
+                     # move the event to the first track of the channel if its original track is empty
+                     note_tracks = channel_note_tracks[c]
+                     if len(note_tracks) != 0 and track_idx not in note_tracks:
+                         track_idx = channel_note_tracks[c][0]
+                     new_track_idx = tr_map[track_idx]
+                     event[3] = new_track_idx
+                     if name == "patch_change" and event[4] not in patch_channels:
+                         patch_channels.append(event[4])
+             for key_sig in key_signature_to_remove:
+                 event_list.remove(key_sig)
+             event_list += key_signature_to_add
+             track_to_channels = {}
+             for c, tr_map in track_idx_map.items():
+                 if c not in channels_map:
+                     continue
+                 c = channels_map[c]
+                 for _, track_idx in tr_map.items():
+                     track_to_channels.setdefault(track_idx, [])
+                     cs = track_to_channels[track_idx]
+                     if c not in cs:
+                         cs.append(c)
+
+         if add_default_instr:
+             for c in channels:
+                 if c not in patch_channels and c in track_idx_dict:
+                     event_list.append(["patch_change", 0, 0, track_idx_dict[c], c, 0])
+
+         if len(key_sigs) == 0 or all([key_sig[4] == 7 for key_sig in key_sigs]):
+             # detect the key signature, or fix the default key signature
+             root_key = self.detect_key_signature(note_key_hist)
+             if root_key is not None:
+                 sf = self.key2sf(root_key, 0)
+                 # print("detect_key_signature", sf)
+                 if len(key_sigs) == 0:
+                     for tr, cs in track_to_channels.items():
+                         if remap_track_channel and tr == 0:
+                             continue
+                         new_event = ["key_signature", 0, 0, tr, (0 if (len(cs) == 1 and cs[0] == 9) else sf) + 7, 0]
+                         event_list.append(new_event)
+                 else:
+                     for key_sig in key_sigs:
+                         tr = key_sig[3]
+                         if tr in track_to_channels:
+                             cs = track_to_channels[tr]
+                             if len(cs) == 1 and cs[0] == 9:
+                                 continue
+                         key_sig[4] = sf + 7
+                         key_sig[5] = 0
+             else:
+                 # remove the default key signature
+                 for key_sig in key_sigs:
+                     event_list.remove(key_sig)
+
+         events_name_order = ["time_signature", "key_signature", "set_tempo", "patch_change", "control_change", "note"]
+         events_name_order = {name: i for i, name in enumerate(events_name_order)}
+         events_order = lambda e: e[1:4] + [events_name_order[e[0]]]
+         event_list = sorted(event_list, key=events_order)
+
+         setup_events = {}
+         notes_in_setup = False
+         for i, event in enumerate(event_list):  # optimise setup
+             new_event = [*event]  # make a copy of the event
+             if event[0] not in ["note", "time_signature"]:
+                 new_event[1] = 0
+                 new_event[2] = 0
+             has_next = False
+             has_pre = False
+             if i < len(event_list) - 1:
+                 next_event = event_list[i + 1]
+                 has_next = event[1] + event[2] == next_event[1] + next_event[2]
+             if notes_in_setup and i > 0:
+                 pre_event = event_list[i - 1]
+                 has_pre = event[1] + event[2] == pre_event[1] + pre_event[2]
+             if (event[0] == "note" and not has_next) or (notes_in_setup and not has_pre):
+                 event_list = sorted(setup_events.values(), key=events_order) + event_list[i:]
+                 break
+             else:
+                 if event[0] == "note":
+                     notes_in_setup = True
+                 if event[0] in ["note", "time_signature", "key_signature"]:
+                     key = tuple([event[0]] + event[3:-2])
+                 else:
+                     key = tuple([event[0]] + event[3:-1])
+                 setup_events[key] = new_event
+
+         last_t1 = 0
+         midi_seq = []
+         for event in event_list:
+             if remove_empty_channels and event[0] in ["control_change", "patch_change"] and event[4] in empty_channels:
+                 continue
+             cur_t1 = event[1]
+             event[1] = event[1] - last_t1
+             tokens = self.event2tokens(event)
+             if not tokens:
+                 continue
+             midi_seq.append(tokens)
+             last_t1 = cur_t1
+
+         if add_bos_eos:
+             bos = [self.bos_id] + [self.pad_id] * (self.max_token_seq - 1)
+             eos = [self.eos_id] + [self.pad_id] * (self.max_token_seq - 1)
+             midi_seq = [bos] + midi_seq + [eos]
+         return midi_seq
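+
+     # A minimal round-trip sketch (hypothetical file path; midi2score and
+     # score2midi are provided by the bundled MIDI.py module):
+     #
+     #   tokenizer = MIDITokenizerV2()
+     #   score = MIDI.midi2score(open("song.mid", "rb").read())
+     #   seq = tokenizer.tokenize(score)       # list of fixed-width token rows
+     #   score_back = tokenizer.detokenize(seq)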
+
+     def event2tokens(self, event):
+         name = event[0]
+         params = event[1:]
+         if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
+             return []
+         tokens = [self.event_ids[name]] + [self.parameter_ids[p][params[i]]
+                                            for i, p in enumerate(self.events[name])]
+         tokens += [self.pad_id] * (self.max_token_seq - len(tokens))
+         return tokens
+
+     def tokens2event(self, tokens):
+         if tokens[0] not in self.id_events:
+             return []
+         name = self.id_events[tokens[0]]
+         if len(tokens) <= len(self.events[name]):
+             return []
+         params = tokens[1:]
+         params = [params[i] - self.parameter_ids[p][0] for i, p in enumerate(self.events[name])]
+         if not all([0 <= params[i] < self.event_parameters[p] for i, p in enumerate(self.events[name])]):
+             return []
+         event = [name] + params
+         return event
+
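+     # Every row is [event_id, parameter tokens..., padding] with fixed width
+     # max_token_seq; tokens2event is the inverse and returns [] for rows whose
+     # type or parameters fall outside the vocabulary. A small sketch (parameter
+     # order here assumes self.events["set_tempo"] = time1, time2, track, bpm):
+     #
+     #   row = tokenizer.event2tokens(["set_tempo", 0, 0, 0, 120])
+     #   assert tokenizer.tokens2event(row)[0] == "set_tempo"
+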
+     def detokenize(self, midi_seq):
+         ticks_per_beat = 480
+         tracks_dict = {}
+         t1 = 0
+         for tokens in midi_seq:
+             if tokens[0] in self.id_events:
+                 event = self.tokens2event(tokens)
+                 if not event:
+                     continue
+                 name = event[0]
+                 t1 += event[1]
+                 t = t1 * 16 + event[2]
+                 t = int(t * ticks_per_beat / 16)
+                 track_idx = event[3]
+                 event_new = [name, t]
+                 if name == "note":
+                     c, p, v, d = event[4:]
+                     d = int(d * ticks_per_beat / 16)
+                     event_new += [d, c, p, v]
+                 elif name == "control_change" or name == "patch_change":
+                     event_new += event[4:]
+                 elif name == "set_tempo":
+                     event_new += [self.bpm2tempo(event[4])]
+                 elif name == "time_signature":
+                     nn, dd = event[4:]
+                     nn += 1
+                     dd += 1
+                     event_new += [nn, dd, 24, 8]  # usually cc, bb = 24, 8
+                 elif name == "key_signature":
+                     sf, mi = event[4:]
+                     sf -= 7
+                     event_new += [sf, mi]
+                 else:  # should not go here
+                     continue
+                 if track_idx not in tracks_dict:
+                     tracks_dict[track_idx] = []
+                 tracks_dict[track_idx].append(event_new)
+         tracks = [tr for idx, tr in sorted(list(tracks_dict.items()), key=lambda it: it[0])]
+
+         for i in range(len(tracks)):  # to eliminate note overlap
+             track = tracks[i]
+             track = sorted(track, key=lambda e: e[1])
+             last_note_t = {}
+             zero_len_notes = []
+             for e in reversed(track):
+                 if e[0] == "note":
+                     t, d, c, p = e[1:5]
+                     key = (c, p)
+                     if key in last_note_t:
+                         d = min(d, max(last_note_t[key] - t, 0))
+                     last_note_t[key] = t
+                     e[2] = d
+                     if d == 0:
+                         zero_len_notes.append(e)
+             for e in zero_len_notes:
+                 track.remove(e)
+             tracks[i] = track
+         return [ticks_per_beat, *tracks]
+
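+     # detokenize returns [ticks_per_beat, track_0, track_1, ...] in MIDI.py's
+     # score format, so it can be written straight back out, e.g. (hypothetical
+     # path):
+     #
+     #   with open("out.mid", "wb") as f:
+     #       f.write(MIDI.score2midi(tokenizer.detokenize(seq)))
+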
+     def midi2img(self, midi_score):
+         ticks_per_beat = midi_score[0]
+         notes = []
+         max_time = 1
+         track_num = len(midi_score[1:])
+         for track_idx, track in enumerate(midi_score[1:]):
+             for event in track:
+                 t = round(16 * event[1] / ticks_per_beat)
+                 if event[0] == "note":
+                     d = max(1, round(16 * event[2] / ticks_per_beat))
+                     c, p = event[3:5]
+                     max_time = max(max_time, t + d + 1)
+                     notes.append((track_idx, c, p, t, d))
+         img = np.zeros((128, max_time, 3), dtype=np.uint8)
+         colors = {(i, j): np.random.randint(50, 256, 3) for i in range(track_num) for j in range(16)}
+         for note in notes:
+             tr, c, p, t, d = note
+             img[p, t: t + d] = colors[(tr, c)]
+         img = PIL.Image.fromarray(np.flip(img, 0))
+         return img
+
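+     # Renders a quick piano-roll preview: the y axis is MIDI pitch (flipped so
+     # high notes are at the top), the x axis is time at 16 columns per beat,
+     # with one random colour per (track, channel) pair, e.g. (hypothetical
+     # path):
+     #
+     #   tokenizer.midi2img(score).save("roll.png")
+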
+     def augment(self, midi_seq, max_pitch_shift=4, max_vel_shift=10, max_cc_val_shift=10, max_bpm_shift=10,
+                 max_track_shift=0, max_channel_shift=16):
+         pitch_shift = random.randint(-max_pitch_shift, max_pitch_shift)
+         vel_shift = random.randint(-max_vel_shift, max_vel_shift)
+         cc_val_shift = random.randint(-max_cc_val_shift, max_cc_val_shift)
+         bpm_shift = random.randint(-max_bpm_shift, max_bpm_shift)
+         track_shift = random.randint(0, max_track_shift)
+         channel_shift = random.randint(0, max_channel_shift)
+         midi_seq_new = []
+         key_signature_tokens = []
+         track_to_channels = {}
+         for tokens in midi_seq:
+             tokens_new = [*tokens]
+             if tokens[0] in self.id_events:
+                 name = self.id_events[tokens[0]]
+                 for i, pn in enumerate(self.events[name]):
+                     if pn == "track":
+                         tr = tokens[1 + i] - self.parameter_ids[pn][0]
+                         tr += track_shift
+                         tr = tr % self.event_parameters[pn]
+                         tokens_new[1 + i] = self.parameter_ids[pn][tr]
+                     elif pn == "channel":
+                         c = tokens[1 + i] - self.parameter_ids[pn][0]
+                         c0 = c
+                         c += channel_shift
+                         c = c % self.event_parameters[pn]
+                         if c0 == 9:  # keep the drum channel in place
+                             c = 9
+                         elif c == 9:
+                             c = (9 + channel_shift) % self.event_parameters[pn]
+                         tokens_new[1 + i] = self.parameter_ids[pn][c]
+
+                 if name == "note":
+                     tr = tokens[3] - self.parameter_ids["track"][0]
+                     c = tokens[4] - self.parameter_ids["channel"][0]
+                     p = tokens[5] - self.parameter_ids["pitch"][0]
+                     v = tokens[6] - self.parameter_ids["velocity"][0]
+                     if c != 9:  # no pitch shift for drums
+                         p += pitch_shift
+                         if not 0 <= p < 128:
+                             return midi_seq
+                     v += vel_shift
+                     v = max(1, min(127, v))
+                     tokens_new[5] = self.parameter_ids["pitch"][p]
+                     tokens_new[6] = self.parameter_ids["velocity"][v]
+                     track_to_channels.setdefault(tr, [])
+                     cs = track_to_channels[tr]
+                     if c not in cs:
+                         cs.append(c)
+                 elif name == "control_change":
+                     cc = tokens[5] - self.parameter_ids["controller"][0]
+                     val = tokens[6] - self.parameter_ids["value"][0]
+                     if cc in [1, 2, 7, 11]:  # modulation, breath, volume, expression
+                         val += cc_val_shift
+                         val = max(1, min(127, val))
+                     tokens_new[6] = self.parameter_ids["value"][val]
+                 elif name == "set_tempo":
+                     bpm = tokens[4] - self.parameter_ids["bpm"][0]
+                     bpm += bpm_shift
+                     bpm = max(1, min(383, bpm))
+                     tokens_new[4] = self.parameter_ids["bpm"][bpm]
+                 elif name == "key_signature":
+                     sf = tokens[4] - self.parameter_ids["sf"][0]
+                     mi = tokens[5] - self.parameter_ids["mi"][0]
+                     sf -= 7
+                     k = self.sf2key(sf)
+                     k = (k + pitch_shift) % 12
+                     sf = self.key2sf(k, mi)
+                     sf += 7
+                     tokens_new[4] = self.parameter_ids["sf"][sf]
+                     tokens_new[5] = self.parameter_ids["mi"][mi]
+                     key_signature_tokens.append(tokens_new)
+             midi_seq_new.append(tokens_new)
+         for tokens in key_signature_tokens:
+             tr = tokens[3] - self.parameter_ids["track"][0]
+             if tr in track_to_channels:
+                 cs = track_to_channels[tr]
+                 if len(cs) == 1 and cs[0] == 9:
+                     tokens[4] = self.parameter_ids["sf"][7]  # sf=0
+         return midi_seq_new
+
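+     # Pitch, velocity, CC-value, tempo, track and channel offsets are drawn
+     # once per call, so one augmentation is applied consistently across the
+     # whole sequence; if the pitch shift would push any note outside 0..127,
+     # the original sequence is returned unchanged, e.g.
+     #
+     #   seq_aug = tokenizer.augment(seq, max_pitch_shift=2)
+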
+     def check_quality(self, midi_seq, alignment_min=0.3, tonality_min=0.8, piano_max=0.7, notes_bandwidth_min=3,
+                       notes_density_max=50, notes_density_min=2.5, total_notes_max=20000, total_notes_min=256,
+                       note_window_size=16):
+         total_notes = 0
+         channels = []
+         time_hist = [0] * 16
+         note_windows = {}
+         notes_sametime = []
+         notes_density_list = []
+         tonality_list = []
+         notes_bandwidth_list = []
+         instruments = {}
+         piano_channels = []
+         abs_t1 = 0
+         last_t = 0
+         for tokens in midi_seq:
+             event = self.tokens2event(tokens)
+             if not event:
+                 continue
+             t1, t2, tr = event[1:4]
+             abs_t1 += t1
+             t = abs_t1 * 16 + t2
+             c = None
+             if event[0] == "note":
+                 c, p, v, d = event[4:]
+                 total_notes += 1
+                 time_hist[t2] += 1
+                 if c != 9:  # ignore drum channel
+                     if c not in instruments:
+                         instruments[c] = 0
+                         if c not in piano_channels:
+                             piano_channels.append(c)
+                     note_windows.setdefault(abs_t1 // note_window_size, []).append(p)
+                     if last_t != t:
+                         notes_sametime = [(et, p_) for et, p_ in notes_sametime if et > last_t]
+                         notes_sametime_p = [p_ for _, p_ in notes_sametime]
+                         if len(notes_sametime) > 0:
+                             notes_bandwidth_list.append(max(notes_sametime_p) - min(notes_sametime_p))
+                     notes_sametime.append((t + d - 1, p))
+             elif event[0] == "patch_change":
+                 c, p = event[4:]
+                 instruments[c] = p
+                 if p == 0 and c not in piano_channels:
+                     piano_channels.append(c)
+             if c is not None and c not in channels:
+                 channels.append(c)
+             last_t = t
+         reasons = []
+         if total_notes < total_notes_min:
+             reasons.append("total_min")
+         if total_notes > total_notes_max:
+             reasons.append("total_max")
+         if len(note_windows) == 0 and total_notes > 0:
+             reasons.append("drum_only")
+         if reasons:
+             return False, reasons
+         time_hist = sorted(time_hist, reverse=True)
+         alignment = sum(time_hist[:2]) / total_notes
+         for notes in note_windows.values():
+             key_hist = [0] * 12
+             for p in notes:
+                 key_hist[p % 12] += 1
+             key_hist = sorted(key_hist, reverse=True)
+             tonality_list.append(sum(key_hist[:7]) / len(notes))
+             notes_density_list.append(len(notes) / note_window_size)
+         tonality_list = sorted(tonality_list)
+         tonality = sum(tonality_list) / len(tonality_list)
+         notes_bandwidth = sum(notes_bandwidth_list) / len(notes_bandwidth_list) if notes_bandwidth_list else 0
+         notes_density = max(notes_density_list) if notes_density_list else 0
+         piano_ratio = len(piano_channels) / len(channels)
+         if len(channels) <= 3:  # ignore the piano threshold for piano solo MIDIs
+             piano_max = 1
+         if alignment < alignment_min:  # check whether the notes align to the bars (some MIDI files are live recordings)
+             reasons.append("alignment")
+         if tonality < tonality_min:  # check whether the music is tonal
+             reasons.append("tonality")
+         if notes_bandwidth < notes_bandwidth_min:  # check whether the music is a melodic line only
+             reasons.append("bandwidth")
+         if not notes_density_min < notes_density < notes_density_max:
+             reasons.append("density")
+         if piano_ratio > piano_max:  # check whether most instruments are piano (some MIDI files lack correct instrument assignments)
+             reasons.append("piano")
+         return not reasons, reasons
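+
+     # check_quality returns (passed, reasons): `reasons` names every failed
+     # heuristic, e.g.
+     #
+     #   ok, reasons = tokenizer.check_quality(seq)
+     #   if not ok:
+     #       print("rejected:", reasons)  # e.g. ['alignment', 'density']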
+
+
+ class MIDITokenizer:
+     def __new__(cls, version="v2"):
+         if version == "v1":
+             return MIDITokenizerV1()
+         elif version == "v2":
+             return MIDITokenizerV2()
+         else:
+             raise ValueError(f"Unsupported version: {version}")
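+
+ # Factory sketch: the version string selects the concrete tokenizer, e.g.
+ #
+ #   tokenizer = MIDITokenizer("v2")
+ #   assert isinstance(tokenizer, MIDITokenizerV2)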
packages.txt ADDED
@@ -0,0 +1 @@
+ fluidsynth
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ --extra-index-url https://download.pytorch.org/whl/cu124
+ Pillow
+ numpy
+ torch
+ onnxruntime-gpu
+ peft>=0.13.0
+ transformers>=4.36
+ gradio==5.3.0
+ pyfluidsynth
+ tqdm
+ huggingface_hub