shoogle · drewfustin · Jun 24, 2020
diff --git a/midicsv_process_python3.py b/midicsv_process_python3.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+With [midicsv](https://www.fourmilab.ch/webtools/midicsv/) installed to $PATH,
+parse MIDI file and format it into CSV useable for c82.net's
+[Off the Staff](https://c82.net/blog/?id=75)
+Usage (from command line):
+python midicsv_process_python3 MID_OR_CSV_FILENAME OUTPUT_CSV_FILENAME
+"""
+import argparse
+from io import BytesIO
+import numpy as np
+import pandas as pd
+import pathlib
+import subprocess
+from typing import Dict, Sequence
+
+NOTELETTERS = ["C","C","D","D","E","F","F","G","G","A","A","B"]
+SHARPS = [ "","#", "","#", "", "","#", "","#", "","#", ""]
+
+
+def open_file(input_file: str) -> pd.DataFrame:
+    """
+    Open the input_file and format it into a pandas DataFrame
+    If input_file's extension is .mid/.midi/.kar, run it through midicsv to translate to .csv
+    If input_file's extension is .csv, load it into the DataFrame
+    """
+    filepath: pathlib.Path = pathlib.Path(input_file).expanduser()
+    if filepath.suffix in ('.mid', '.midi', '.kar'):
+        return pd.read_csv(
+            BytesIO(subprocess.check_output(["midicsv", filepath])),
+            sep=r"\,\s",
+            engine="python",
+            names=['part', 'tick', 'type', 'tempo', 'pitch', 'velocity', 'time_signature'],
+            encoding="iso-8859-1"
+        )
+    if filepath.suffix in ('.csv'):
+        return pd.read_csv(
+            filepath,
+            sep=r"\,\s",
+            engine="python",
+            names=['part', 'tick', 'type', 'tempo', 'pitch', 'velocity', 'time_signature'],
+            encoding="iso-8859-1"
+        )
+    raise NotImplementedError(
+        f"File with extension {filepath.suffix} not supported, "
+        "use (.mid/.midi/.kar/.csv)"
+    )
+
+
+def get_ticks_per_quarter_note(df: pd.DataFrame) -> int:
+    """
+    For this piece, get the number of ticks per quarter note, stored on the Header-type entry
+    """
+    return int(df.loc[df['type'] == 'Header', 'velocity'].values[0])
+
+
+def get_tempos(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Parse the midicsv-transformed data to pull out and record tempo-type entries
+    """
+    return (
+        df.loc[df['type'] == 'Tempo', ['tick', 'tempo']]
+          .drop_duplicates()
+          .set_index('tick')
+          .loc[:, 'tempo']
+          .astype(int)
+    )
+
+
+def get_notes(df: pd.DataFrame, tempos: pd.Series) -> pd.DataFrame:
+    """
+    Parse the midicsv-transformed data to pull out and record note-type entries
+    """
+    return (
+        df
+        # Get "on" notes
+        .loc[(df['type'] == 'Note_on_c') & (df['velocity'].fillna(0).astype(int) != 0),
+             ['part', 'tick', 'pitch', 'velocity']]
+        .drop_duplicates()
+        # Remove bad data
+        .loc[lambda x: x['pitch'].astype(int) != 0]
+        # Append something with null part/pitch/velocity for each tempo for tempo-filling
+        .append(pd.DataFrame({'tick': tempos.index}))
+        # Sort by tick and attach tempo to each tick (using fillna with ffill method)
+        .sort_values('tick')
+        .assign(tempo=lambda x: x['tick'].map(tempos).fillna(method='ffill'))
+        # Get rid of the tempo-filling placeholders
+        .dropna()
+        # Resort to original order
+        .sort_values(['part', 'tick'])
+        # Cross-join all "off" notes for a part/pitch combo
+        .merge(
+            df.loc[(df['type'] == 'Note_off_c') | (df['velocity'].fillna(1).astype(int) == 0),
+                   ['part', 'tick', 'pitch']],
+            on=['part', 'pitch'],
+            suffixes=('', '_off')
+        )
+        # Make sure the cross-joined dataframe is sorted
+        .sort_values(['part', 'tick', 'tick_off'])
+        # Determine how long is between each tick/tick_off pair
+        .assign(dur_ticks=lambda x: x['tick_off'] - x['tick'])
+        # Remove "off" ticks that occur before tick
+        .loc[lambda x: x['dur_ticks'] > 0]
+        # Only keep the first "off" tick, thus eliminating the cross-join
+        .groupby(['part', 'tick', 'pitch', 'velocity', 'tempo'])
+        .agg(
+            tick_off=('tick_off', 'first'),
+            dur_ticks=('dur_ticks', 'first')
+        )
+        # Get rid of tick_off column, since only dur_ticks is needed
+        .drop(columns=['tick_off'])
+        # Rename ticks to start_ticks
+        .reset_index()
+        .rename(columns={'tick': 'start_ticks'})
+    )
+
+
+def tick_to_time(ticks: pd.Series, tempo: pd.Series, ticks_per_quarter_note: int) -> pd.Series:
+    """
+    Translate ticks to seconds given the ticks_per_quarter_note conversion factor
+    """
+    return ticks * tempo / ticks_per_quarter_note / 1000000
+
+
+def get_times(notes: pd.DataFrame, tempos: pd.Series, ticks_per_quarter_note: int) -> pd.DataFrame:
+    """
+    Parse the midicsv-transformed data to process timestamps
+    """
+    start_times = pd.DataFrame(
+        {
+            'tempo_starts': np.append(
+                0, tick_to_time(np.diff(tempos.index), tempos[:-1].values, ticks_per_quarter_note)
+            ).cumsum(),
+            'tempo_ticks': tempos.index
+        }
+    )
+    return (
+        notes
+        .assign(dur_secs=lambda x: tick_to_time(x['dur_ticks'], x['tempo'], ticks_per_quarter_note))
+        .merge(start_times, left_on='start_ticks', right_on='tempo_ticks', how='outer')
+        .assign(tick_index=lambda x: x['start_ticks'].fillna(x['tempo_ticks']))
+        .sort_values('tick_index')
+        .assign(tempo_starts=lambda x: x['tempo_starts'].fillna(method='ffill'))
+        .assign(tempo_ticks=lambda x: x['tempo_ticks'].fillna(method='ffill'))
+        .assign(
+            start_secs=(
+                lambda x: (
+                    tick_to_time(x['start_ticks'] - x['tempo_ticks'],
+                    x['tempo'],
+                    ticks_per_quarter_note) + x['tempo_starts']
+                )
+            )
+        )
+        .dropna()
+        .drop(columns=['tick_index', 'tempo_starts', 'tempo_ticks'])
+    )
+
+
+def get_note_names(pitch: pd.Series) -> pd.Series:
+    """
+    Parse the pitches to get note names
+    """
+    letters = ["C","C","D","D","E","F","F","G","G","A","A","B"]
+    sharps = [ "","#", "","#", "", "","#", "","#", "","#", ""]
+    return (
+        pd.Series(
+            [
+                letters[x % 12] + str(int(x / 12) - 1) + sharps[x % 12] for x in pitch.astype(int)
+            ],
+            name='fullNoteOctave',
+            index=pitch.index
+        )
+    )
+
+
+
+def process_midicsv(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Process the midicsv-transformed data
+    """
+    ticks_per_quarter_note = get_ticks_per_quarter_note(df)
+    tempos = get_tempos(df)
+    notes = get_notes(df, tempos)
+    times = get_times(notes, tempos, ticks_per_quarter_note)
+    note_names = get_note_names(notes['pitch'])
+    return (
+        times
+        .assign(fullNoteOctave=note_names)
+        .astype({
+            'start_ticks': int,
+            'start_secs': float,
+            'dur_ticks': int,
+            'dur_secs': float,
+            'pitch': int,
+            'fullNoteOctave': str,
+            'velocity': int,
+            'part': int
+        })
+        .loc[:, ['start_ticks', 'start_secs', 'dur_ticks', 'dur_secs',
+                 'pitch', 'fullNoteOctave', 'velocity', 'part']]
+        .sort_values(['start_ticks', 'part'])
+    )
+
+
+if __name__ == "__main__":
+    # Parse command line arguments to get input and output files
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="input file (raw .mid or midicsv output .csv)")
+    parser.add_argument("output_file", help="output filename (.csv format)")
+    args = parser.parse_args()
+    # Load the midicsv-transformed data as a pandas DataFrame
+    df_input = open_file(args.input_file)
+    # Process the midicsv to get a properly formatted CSV for use by c82.net's Off the Staff
+    df_output = process_midicsv(df_input)
+    # Save the DataFrame to output_file
+    df_output.to_csv(args.output_file, index=False)