diff --git a/.gitignore b/.gitignore index 5fd1337..866c379 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,12 @@ docs/_build/ fpstats fpmstats tmp.fpdb + +# Dev Stuff +venv/ +tests/ +precomppkdir/ +queries/ + +# OS objects +.DS_Store diff --git a/Makefile b/Makefile index 06d3c06..34073d1 100644 --- a/Makefile +++ b/Makefile @@ -16,70 +16,70 @@ test: test_onecore test_onecore_precomp test_onecore_newmerge test_onecore_preco rm -f fpdbase*.pklz test_onecore: fpdbase.pklz - ${AUDFPRINT} match --dbase fpdbase.pklz query.mp3 + ${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3 test_remove: fpdbase.pklz - ${AUDFPRINT} remove --dbase fpdbase.pklz Nine_Lives/05-Full_Circle.mp3 Nine_Lives/01-Nine_Lives.mp3 + ${AUDFPRINT} remove --dbase fpdbase.pklz tests/data/Nine_Lives/05-Full_Circle.mp3 tests/data/Nine_Lives/01-Nine_Lives.mp3 ${AUDFPRINT} list --dbase fpdbase.pklz - ${AUDFPRINT} add --dbase fpdbase.pklz Nine_Lives/01-Nine_Lives.mp3 Nine_Lives/05-Full_Circle.mp3 + ${AUDFPRINT} add --dbase fpdbase.pklz tests/data/Nine_Lives/01-Nine_Lives.mp3 tests/data/Nine_Lives/05-Full_Circle.mp3 ${AUDFPRINT} list --dbase fpdbase.pklz - ${AUDFPRINT} match --dbase fpdbase.pklz query.mp3 + ${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3 fpdbase.pklz: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py - ${AUDFPRINT} new --dbase fpdbase.pklz Nine_Lives/0*.mp3 - ${AUDFPRINT} add --dbase fpdbase.pklz Nine_Lives/1*.mp3 + ${AUDFPRINT} new --dbase fpdbase.pklz tests/data/Nine_Lives/0*.mp3 + ${AUDFPRINT} add --dbase fpdbase.pklz tests/data/Nine_Lives/1*.mp3 test_onecore_precomp: precompdir - ${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/Nine_Lives/0* - ${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/Nine_Lives/1* + ${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/tests/data/Nine_Lives/0* + ${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/tests/data/Nine_Lives/1* ${AUDFPRINT} merge --dbase fpdbase1.pklz fpdbase0.pklz - ${AUDFPRINT} match --dbase 
fpdbase1.pklz precompdir/query.afpt + ${AUDFPRINT} match --dbase fpdbase1.pklz precompdir/tests/data/query.afpt test_onecore_newmerge: precompdir - ${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/Nine_Lives/0* - ${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/Nine_Lives/1* + ${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/tests/data/Nine_Lives/0* + ${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/tests/data/Nine_Lives/1* rm -f fpdbase2.pklz ${AUDFPRINT} newmerge --dbase fpdbase2.pklz fpdbase0.pklz fpdbase1.pklz - ${AUDFPRINT} match --dbase fpdbase2.pklz precompdir/query.afpt + ${AUDFPRINT} match --dbase fpdbase2.pklz precompdir/tests/data/query.afpt precompdir: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py rm -rf precompdir mkdir precompdir - ${AUDFPRINT} precompute --precompdir precompdir Nine_Lives/*.mp3 - ${AUDFPRINT} precompute --precompdir precompdir --shifts 4 query.mp3 + ${AUDFPRINT} precompute --precompdir precompdir tests/data/Nine_Lives/*.mp3 + ${AUDFPRINT} precompute --precompdir precompdir --shifts 4 tests/data/query.mp3 test_onecore_precomppk: precomppkdir - ${AUDFPRINT} new --dbase fpdbase0.pklz precomppkdir/Nine_Lives/0* - ${AUDFPRINT} new --dbase fpdbase1.pklz precomppkdir/Nine_Lives/1* + ${AUDFPRINT} new --dbase fpdbase0.pklz precomppkdir/tests/data/Nine_Lives/0* + ${AUDFPRINT} new --dbase fpdbase1.pklz precomppkdir/tests/data/Nine_Lives/1* ${AUDFPRINT} merge --dbase fpdbase1.pklz fpdbase0.pklz - ${AUDFPRINT} match --dbase fpdbase1.pklz precomppkdir/query.afpk + ${AUDFPRINT} match --dbase fpdbase1.pklz precomppkdir/tests/data/query.afpk rm -rf precomppkdir precomppkdir: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py rm -rf precomppkdir mkdir precomppkdir - ${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir Nine_Lives/*.mp3 - ${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir --shifts 4 query.mp3 + ${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir 
tests/data/Nine_Lives/*.mp3 + ${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir --shifts 4 tests/data/query.mp3 test_mucore: fpdbase_mu.pklz - ${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 query.mp3 + ${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 tests/data/query.mp3 fpdbase_mu.pklz: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py - ${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 Nine_Lives/0*.mp3 - ${AUDFPRINT} add --dbase fpdbase_mu.pklz --ncores 4 Nine_Lives/1*.mp3 + ${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 tests/data/Nine_Lives/0*.mp3 + ${AUDFPRINT} add --dbase fpdbase_mu.pklz --ncores 4 tests/data/Nine_Lives/1*.mp3 test_mucore_precomp: precompdir_mu - ${AUDFPRINT} new --dbase fpdbase_mu0.pklz --ncores 4 precompdir_mu/Nine_Lives/0* - ${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/Nine_Lives/1* + ${AUDFPRINT} new --dbase fpdbase_mu0.pklz --ncores 4 precompdir_mu/tests/data/Nine_Lives/0* + ${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/tests/data/Nine_Lives/1* ${AUDFPRINT} merge --dbase fpdbase_mu.pklz fpdbase_mu0.pklz - ${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt + ${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py rm -rf precompdir_mu mkdir precompdir_mu - ${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu Nine_Lives/*.mp3 - ${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu --shifts 4 query.mp3 query.mp3 query.mp3 
query.mp3 query.mp3 query.mp3 + ${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu tests/data/Nine_Lives/*.mp3 + ${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu --shifts 4 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 -test_hash_mask: - ${AUDFPRINT} new --dbase fpdbase.pklz --hashbits 16 Nine_Lives/*.mp3 - ${AUDFPRINT} match --dbase fpdbase.pklz query.mp3 +test_hash_mask: + ${AUDFPRINT} new --dbase fpdbase.pklz --hashbits 16 tests/data/Nine_Lives/*.mp3 + ${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3 diff --git a/README.md b/README.md index 46c7b6a..a07f1bc 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ Options: -v , --verbose Verbosity level [default: 1] -I, --illustrate Make a plot showing the match -J, --illustrate-hpf Plot the match, using onset enhancement + -O, --json Return json object instead of string -W , --wavdir Find sound files under this dir [default: ] -V , --wavext Extension to add to wav file names [default: ] --version Report version number @@ -145,5 +146,3 @@ Scaling The fingerprint database records 2^20 (~1M) distinct fingerprints, with (by default) 100 entries for each fingerprint bucket. When the bucket fills, track entries are dropped at random; since matching depends only on making a minimum number of matches, but no particular match, dropping some of the more popular ones does not prevent matching. The Matlab version has been successfully used for databases of 100k+ tracks. Reducing the hash density (`--density`) leads to smaller reference database size, and the capacity to record more reference items before buckets begin to fill; a density of 7.0 works well. Times (in units of 256 samples, i.e., 23 ms at the default 11kHz sampling rate) are stored in the bottom 14 bits of each database entry, meaning that times larger than 2^14*0.023 = 380 sec, or about 6 mins, are aliased. 
If you want to correctly identify time offsets in tracks longer than this, you need to use a larger `--maxtimebits`; e.g. `--maxtimebits 16` increases the time range to 65,536 frames, or about 25 minutes at 11 kHz. The trade-off is that the remaining bits in each 32 bit entry (i.e., 18 bits for the default 14 bit times) are used to store the track ID. Thus, by default, the database can only remember 2^18 = 262k tracks; using a larger `--maxtimebits` will reduce this; similarly, you can increase the number of distinct tracks by reducing `--maxtimebits`, which doesn't prevent matching tracks, but progressively reduces discrimination as the number of distinct time slots reduces (and can make the reported time offsets, and time ranges for `--find-time-ranges`, completely wrong for longer tracks). - - diff --git a/audfprint.py b/audfprint.py index c33cd37..829216d 100755 --- a/audfprint.py +++ b/audfprint.py @@ -17,6 +17,7 @@ import docopt # For command line interface import joblib # for match +import json import audfprint_analyze # The actual analyzer class/code import audfprint_match # Access to match functions, used in command line interface @@ -35,12 +36,16 @@ def filename_list_iterator(filelist, wavdir, wavext, listflag): """ Iterator to yeild all the filenames, possibly interpreting them as list files, prepending wavdir """ if not listflag: + # print(filelist) for filename in filelist: - yield os.path.join(wavdir, filename + wavext) + # print('listflag', os.path.join(wavdir, filename + wavext)) + yield filename#os.path.join(wavdir, filename + wavext) else: + # print(filelist) for listfilename in filelist: with open(listfilename, 'r') as f: for filename in f: + # print('|', os.path.join(wavdir, filename.rstrip('\n') + wavext)) yield os.path.join(wavdir, filename.rstrip('\n') + wavext) @@ -130,7 +135,7 @@ def make_ht_from_list(analyzer, filelist, hashbits, depth, maxtime, pipe=None): # Add in the files for filename in filelist: hashes = 
analyzer.wavfile2hashes(filename) - ht.store(filename, hashes) + ht.store(filename, hashes, analyzer.density) # Pass back to caller if pipe: pipe.send(ht) @@ -161,17 +166,17 @@ def do_cmd(cmd, analyzer, hash_tab, filename_iter, matcher, outdir, type, report elif cmd == 'match': # Running query, single-core mode for num, filename in enumerate(filename_iter): - msgs = matcher.file_match_to_msgs(analyzer, hash_tab, filename, num) - report(msgs) + results = matcher_file_match(matcher, analyzer, hash_tab, filename, num) + report(results, True) elif cmd == 'new' or cmd == 'add': # Adding files tothashes = 0 ix = 0 for filename in filename_iter: - report([time.ctime() + " ingesting #" + str(ix) + ": " - + filename + " ..."]) dur, nhash = analyzer.ingest(hash_tab, filename) + # report([time.ctime() + " ingesting #" + str(ix) +" : "+ filename + " "+ str(hash_table.track_duration(filename))+"s ..."+str(nhash/dur)+"hashes/s"]) + report(["ingesting # {} : track: {}, duration[sec]: {}, density[hashes/sec]: {} ".format(str(ix), filename, str(hash_table.track_duration(filename)), str(nhash//dur))]) tothashes += nhash ix += 1 @@ -229,9 +234,11 @@ def multiproc_add(analyzer, hash_tab, filename_iter, report, ncores): pr[core].join() -def matcher_file_match_to_msgs(matcher, analyzer, hash_tab, filename): +def matcher_file_match(matcher, analyzer, hash_tab, filename, num=None): """Cover for matcher.file_match_to_msgs so it can be passed to joblib""" - return matcher.file_match_to_msgs(analyzer, hash_tab, filename) + if matcher.json: # add for json parsing + return matcher.file_match_to_objs(analyzer, hash_tab, filename, num) + return matcher.file_match_to_msgs(analyzer, hash_tab, filename, num) def do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher, @@ -253,12 +260,12 @@ def do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher, msgslist = joblib.Parallel(n_jobs=ncores)( # Would use matcher.file_match_to_msgs(), but you # can't use joblib on an instance method 
- joblib.delayed(matcher_file_match_to_msgs)(matcher, analyzer, - hash_tab, filename) + joblib.delayed(matcher_file_match)(matcher, analyzer, + hash_tab, filename) for filename in filename_iter ) for msgs in msgslist: - report(msgs) + report(msgs, True) elif cmd == 'new' or cmd == 'add': # We add by forking multiple parallel threads each running @@ -288,7 +295,7 @@ def setup_analyzer(args): # set default value for shifts depending on mode if analyzer.shifts == 0: # Default shift is 4 for match, otherwise 1 - analyzer.shifts = 4 if args['match'] else 1 + analyzer.shifts = 4 if args.get('match') else 1 analyzer.fail_on_error = not args['--continue-on-error'] return analyzer @@ -305,6 +312,7 @@ def setup_matcher(args): matcher.exact_count = args['--exact-count'] | args['--illustrate'] | args['--illustrate-hpf'] matcher.illustrate = args['--illustrate'] | args['--illustrate-hpf'] matcher.illustrate_hpf = args['--illustrate-hpf'] + matcher.json = args.get('--json') matcher.verbose = args['--verbose'] matcher.find_time_range = args['--find-time-range'] matcher.time_quantile = float(args['--time-quantile']) @@ -316,12 +324,23 @@ def setup_reporter(args): """ Creates a logging function, either to stderr or file""" opfile = args['--opfile'] if opfile and len(opfile): - f = open(opfile, "w") - - def report(msglist): - """Log messages to a particular output file""" - for msg in msglist: - f.write(msg + "\n") + if args['--json']: # if args is JSON file, we need to serialize for each arg set + if os.path.exists(opfile): # following spirit of text-output, clobber existing file + os.unlink(opfile) + def report(msglist, is_data=False): + """Log messages to a particular output file""" + # right now, if it's a string, dump to console, otherwise we assume + # it's a friendly object and pass through to JSON append function + if not is_data: + print(msglist) + else: + audfprint_match.json_append_objs(opfile, msglist) + else: + f = open(opfile, "w") + def report(msglist, is_data=False): 
+ """Log messages to a particular output file""" + for msg in msglist: + f.write(msg + "\n") else: def report(msglist): """Log messages by printing to stdout""" @@ -374,13 +393,14 @@ def report(msglist): -v , --verbose Verbosity level [default: 1] -I, --illustrate Make a plot showing the match -J, --illustrate-hpf Plot the match, using onset enhancement + -O, --json Return json object instead of string -W , --wavdir Find sound files under this dir [default: ] -V , --wavext Extension to add to wav file names [default: ] --version Report version number --help Print this message """ -__version__ = 20150406 +__version__ = 20190116 def main(argv): @@ -480,8 +500,8 @@ def main(argv): skip_existing=args['--skip-existing'], strip_prefix=args['--wavdir']) - elapsedtime = time_clock() - initticks - if analyzer and analyzer.soundfiletotaldur > 0.: + elapsedtime = time.clock() - initticks + if analyzer and analyzer.soundfiletotaldur > 0. and not args['--verbose']: print("Processed " + "%d files (%.1f s total dur) in %.1f s sec = %.3f x RT" \ % (analyzer.soundfilecount, analyzer.soundfiletotaldur, diff --git a/audfprint_analyze.py b/audfprint_analyze.py index 78189c0..3dda3df 100644 --- a/audfprint_analyze.py +++ b/audfprint_analyze.py @@ -263,6 +263,7 @@ def find_peaks(self, d, sr): n_hop/sr secs), second is the FFT bin (in units of sr/n_fft Hz). """ + if len(d) == 0: return [] @@ -340,7 +341,9 @@ def wavfile2peaks(self, filename, shifts=None): list of (time, bin) pairs. If specified, resample to sr first. shifts > 1 causes hashes to be extracted from multiple shifts of waveform, to reduce frame effects. 
""" - ext = os.path.splitext(filename)[1] + + _, ext = os.path.splitext(filename) + if ext == PRECOMPPKEXT: # short-circuit - precomputed fingerprint file peaks = peaks_load(filename) @@ -414,8 +417,6 @@ def wavfile2hashes(self, filename): ]).astype(np.int32) hashes = unique_hashes # Or simply np.unique(query_hashes, axis=0) for numpy >= 1.13 - - # print("wavfile2hashes: read", len(hashes), "hashes from", filename) return hashes # ########## functions to link to actual hash table index database ###### # @@ -443,7 +444,7 @@ def ingest(self, hashtable, filename): # n_fft=n_fft, # n_hop=n_hop))) hashes = self.wavfile2hashes(filename) - hashtable.store(filename, hashes) + hashtable.store(filename, hashes, self.density) # return (len(d)/float(sr), len(hashes)) # return (np.max(hashes, axis=0)[0]*n_hop/float(sr), len(hashes)) # soundfiledur is set up in wavfile2hashes, use result here @@ -562,7 +563,7 @@ def glob2hashtable(pattern, density=20.0): totdur = 0.0 tothashes = 0 for ix, file_ in enumerate(filelist): - print(time.ctime(), "ingesting #", ix, ":", file_, "...") + print(time.ctime(), "ingesting #", ix, ":", file_, track_duration(ix), ht.densityperid[ix], "...") dur, nhash = g2h_analyzer.ingest(ht, file_) totdur += dur tothashes += nhash diff --git a/audfprint_match.py b/audfprint_match.py index 64da667..dedda4d 100644 --- a/audfprint_match.py +++ b/audfprint_match.py @@ -5,6 +5,36 @@ Fingerprint matching code for audfprint 2014-05-26 Dan Ellis dpwe@ee.columbia.edu + + +Output: + + Track - matched track from the datastore + match_length - returns how many query seconds matched the resulting track + match_start - returns time position where resulting track started to match in the query + track_start - returns time position where the query started to match in the resulting track + track_coverage - returns a value between [0, 1], informing how much the query covered the resulting track (i.e. 
a 2 minutes query found a 30 seconds track within it, starting at 100th second, track_coverage will be equal to (120 - 100)/30 ~= 0.66) + match_confidence - returns a value between [0, 1]. A value below 0.15 is most probably a false positive. A value bigger than 0.15 is very likely to be an exact match. For good audio quality queries you can expect getting a match_confidence > 0.5. + + Stats contains useful statistics information for fine-tuning the algorithm: + query_duration - time in milliseconds spend just querying the fingerprints datasource. + match_time - time in milliseconds spent generating the acousting fingerprints from the media file. + match_rank - total # of tracks analyzed during query time. If this number exceeds 50, try optimizing your configuration. + match_fingerprints - total # of fingerprints analyzed during query time. If this number exceeds 500, try optimizing your configuration. + query_fingerprints - total # of fingerprints for the query file + + o = {} + o['track'] = '' + o['match_length'] = 0.0 + o['match_start'] = 0.0 + o['track_start'] = 0.0 + o['track_coverage'] = 0.0 + o['match_confidence'] = 0.0 + o['query_duration'] = 0.0 + o['match_time'] = 0.0 + o['match_rank'] = 0.0 + o['match_fingerprints'] = 0.0 + """ from __future__ import division, print_function import os @@ -25,6 +55,7 @@ import audio_read import stft +import json def process_info(): rss = usrtime = 0 p = psutil.Process(os.getpid()) @@ -89,6 +120,35 @@ def find_modes(data, threshold=5, window=0): return localmaxes + datamin, fullvector[localmaxes] +def json_append_objs(filename, msg_objs): + """ Attempt to append several message objects in JSON format. First, + grab a swap file to indicate a write-lock. Second, prepend existing + data. Third, rename/move the file to its original form. 
+ """ + num_loop = 0 + opfile_swap = filename + ".swp" + while os.path.exists(opfile_swap): + if num_loop > 5: + raise IOError("Failed to retrieve JSON serialize lock for '{}' after {} attempts".format(opfile_swap, num_loop)) + num_loop += 1 + time.sleep(5) + + with open(opfile_swap, 'w') as outfile: # open swap file immediately + if os.path.exists(filename): # if existing resources, load first, prepend + with open(filename, 'r') as infile: + try: + json_objs = json.load(infile) + except json.decoder.JSONDecodeError as e: + outfile.close() + os.unlink(opfile_swap) + raise IOError("Failed to parse JSON file '{}' during output append".format(filename)) + msg_objs = json_objs + msg_objs + json.dump(msg_objs, outfile, indent=4) # dump to file handle, pretty printing + if os.path.exists(filename): # only remove when new swap done + os.unlink(filename) + os.rename(opfile_swap, filename) # finally, rename swap to real resource + + class Matcher(object): """Provide matching for audfprint fingerprint queries to hash table""" @@ -120,6 +180,9 @@ def __init__(self): # alignments, stop looking after a while. self.max_alignments_per_id = 100 + + self.match_time = 0.0 + def _best_count_ids(self, hits, ht): """ Return the indexes for the ids with the best counts. hits is a matrix as returned by hash_table.get_hits() @@ -319,8 +382,8 @@ def match_hashes(self, ht, hashes, hashesfor=None): """ # find the implicated id, time pairs from hash table # log("nhashes=%d" % np.shape(hashes)[0]) + # print('hashes', hashes) hits = ht.get_hits(hashes) - bestids, rawcounts = self._best_count_ids(hits, ht) # log("len(rawcounts)=%d max(rawcounts)=%d" % @@ -330,6 +393,7 @@ def match_hashes(self, ht, hashes, hashesfor=None): else: results = self._exact_match_counts(hits, bestids, rawcounts, hashesfor) + # print('results', results) # Sort results by filtered count, descending results = results[(-results[:, 1]).argsort(),] # Where was our best hit in the unfiltered count ranking? 
@@ -356,7 +420,9 @@ def match_file(self, analyzer, ht, filename, number=None): timeoffs, rawmatchcount), also length of input file in sec, and count of raw query hashes extracted """ + tic = time.clock() q_hashes = analyzer.wavfile2hashes(filename) + self.match_time = time.clock() - tic # Fake durations as largest hash time if len(q_hashes) == 0: durd = 0.0 @@ -367,11 +433,14 @@ def match_file(self, analyzer, ht, filename, number=None): numberstring = "#%d" % number else: numberstring = "" - print(time.ctime(), "Analyzed", numberstring, filename, "of", - ('%.3f' % durd), "s " - "to", len(q_hashes), "hashes") + # print(time.ctime(), "Analyzed", numberstring, filename, "of", + # ('%.3f' % durd), "s " + # "to", len(q_hashes), "hashes") # Run query + tic = time.clock() rslts = self.match_hashes(ht, q_hashes) + self.match_time += time.clock() - tic + # Post filtering if self.sort_by_time: rslts = rslts[(-rslts[:, 2]).argsort(), :] @@ -418,6 +487,77 @@ def file_match_to_msgs(self, analyzer, ht, qry, number=None): self.illustrate_match(analyzer, ht, qry) return msgrslt + + def file_match_to_objs(self, analyzer, ht, qry, number=None): + """ Perform a match on a single input file, return list + of utf-8 json objects + + track: + Name of the Track. + + match_length: + The length in seconds of how long the measure/match has taken within the query. + + match_start: + The time in seconds when the match starts to occur in relation to the query time. + + track_start: + The time in seconds when the match starts to occur in relation to the track time. + + track_coverage: + Percentage of the query duration in relation to the matched track duration. + + match_time: + How long it has take to fingerprint the query. + + query_duration: + The length of the query in seconds. + + match_fingerprints: + The count of all the hits within the database in quest. 
+ + match_rank: + How many tracks have been hit through `match_fingerprints` + + """ + + rslts, dur, nhash = self.match_file(analyzer, ht, qry, number) + t_hop = analyzer.n_hop / analyzer.target_sr + msgrslt = [] + + if len(rslts) == 0: + # No matches returned at all + nhashaligned = 0 + else: + for (tophitid, nhashaligned, aligntime, nhashraw, rank, + min_time, max_time) in rslts: + + # figure the number of raw and aligned matches for all hits + o = {} + o['track'] = ht.names[tophitid] + o['match_length'] = 0.0 + o['match_start'] = 0.0 + o['track_start'] = 0.0 + o['match_time'] = self.match_time # consistent with class + o['query_duration'] = dur + o['query_fingerprints'] = nhash + o['match_rank'] = int(rank) # cast from np.int33 + o['match_fingerprints'] = int(nhashraw) # cast from np.int33 + o['match_confidence'] = nhashaligned/nhashraw + o['track_coverage'] = dur/ht.durationperiod[tophitid] + + if self.find_time_range: + o['match_length'] = (max_time - min_time) * t_hop + o['match_start'] = min_time * t_hop + o['track_start'] = (min_time + aligntime) * t_hop + else: + o['match_start'] = aligntime * t_hop + + if self.illustrate: + self.illustrate_match(analyzer, ht, qry) + msgrslt.append(o) + return msgrslt + def illustrate_match(self, analyzer, ht, filename): """ Show the query fingerprints and the matching ones plotted over a spectrogram """ @@ -466,15 +606,13 @@ def illustrate_match(self, analyzer, ht, filename): # Return return results - def localtest(): """Function to provide quick test""" pat = '/Users/dpwe/projects/shazam/Nine_Lives/*mp3' qry = 'query.mp3' hash_tab = audfprint_analyze.glob2hashtable(pat) matcher = Matcher() - rslts, dur, nhash = matcher.match_file(audfprint_analyze.g2h_analyzer, - hash_tab, qry) + rslts, dur, nhash = matcher.match_file(audfprint_analyze.g2h_analyzer, hash_tab, qry) t_hop = 0.02322 print("Matched", qry, "(", dur, "s,", nhash, "hashes)", "as", hash_tab.names[rslts[0][0]], diff --git a/hash_table.py b/hash_table.py index 
1f853c7..8dca6cc 100644 --- a/hash_table.py +++ b/hash_table.py @@ -13,6 +13,7 @@ import math import os import random +import subprocess import sys import numpy as np @@ -36,6 +37,13 @@ # Earliest version that can be updated with load_old HT_OLD_COMPAT_VERSION = 20140920 +def track_duration(filename): + try: + duration = subprocess.check_output(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', filename]).splitlines()[0].decode('utf-8') + duration = float(duration) + except subprocess.CalledProcessError: + duration = 0 + return duration @@ -73,6 +81,8 @@ def __init__(self, filename=None, hashbits=20, depth=100, maxtime=16384): self.names = [] # track number of hashes stored per id self.hashesperid = np.zeros(0, np.uint32) + self.durationperiod = np.zeros(0, np.float32) + self.densityperid = np.zeros(0, np.float32) # Empty params self.params = {} # Record the current version @@ -86,12 +96,16 @@ def reset(self): self.counts[:] = 0 self.names = [] self.hashesperid.resize(0) + self.durationperiod.resize(0) + self.densityperid.resize(0) self.dirty = True - def store(self, name, timehashpairs): + def store(self, name, timehashpairs, density): """ Store a list of hashes in the hash table associated with a particular name (or integer ID) and time. 
""" + duration = track_duration(name) + # print('store', name, track_duration) id_ = self.name_to_id(name, add_if_missing=True) # Now insert the hashes hashmask = (1 << self.hashbits) - 1 @@ -134,6 +148,9 @@ def store(self, name, timehashpairs): self.counts[hash_] = count + 1 # Record how many hashes we (attempted to) save for this id self.hashesperid[id_] += len(timehashpairs) + # print(track_duration, type(track_duration)) + self.durationperiod[id_] += duration + self.densityperid[id_] += density # Mark as unsaved self.dirty = True @@ -242,6 +259,8 @@ def load_pkl(self, name, file_object=None): self.counts = temp.counts self.names = temp.names self.hashesperid = np.array(temp.hashesperid).astype(np.uint32) + self.durationperiod = np.array(temp.durationperiod).astype(np.float32) + self.densityperid = np.array(temp.densityperid).astype(np.float32) self.dirty = False self.params = params @@ -278,6 +297,8 @@ def load_matlab(self, name): self.names = [str(val[0]) if len(val) > 0 else [] for val in mht['HashTableNames'][0]] self.hashesperid = np.array(mht['HashTableLengths'][0]).astype(np.uint32) + self.durationperiod = np.array(mht['HashTableDurations'][0]).astype(np.float32) + self.densityperid = np.array(mht['HashTableDensities'][0]).astype(np.float32) # Matlab uses 1-origin for the IDs in the hashes, but the Python code # also skips using id_ 0, so that names[0] corresponds to id_ 1. 
# Otherwise unmodified database @@ -297,6 +318,8 @@ def merge(self, ht): # size = len(self.counts) self.names += ht.names self.hashesperid = np.append(self.hashesperid, ht.hashesperid) + self.durationperiod = np.append(self.durationperiod, ht.durationperiod) + self.densityperid = np.append(self.densityperid, ht.densityperid) # Shift all the IDs in the second table down by ncurrent idoffset = (1 << self.maxtimebits) * ncurrent for hash_ in np.nonzero(ht.counts)[0]: @@ -334,9 +357,13 @@ def name_to_id(self, name, add_if_missing=False): id_ = self.names.index(None) self.names[id_] = name self.hashesperid[id_] = 0 + self.durationperiod[id_] = 0 + self.densityperid[id_] = 0 except ValueError: self.names.append(name) self.hashesperid = np.append(self.hashesperid, [0]) + self.durationperiod = np.append(self.durationperiod, [0]) + self.densityperid = np.append(self.densityperid, [0]) id_ = self.names.index(name) else: # we were passed in a numerical id @@ -360,6 +387,8 @@ def remove(self, name): hashes_removed += np.sum(id_in_table[hash_]) self.names[id_] = None self.hashesperid[id_] = 0 + self.durationperiod[id_] = 0 + self.densityperid[id_] = 0 self.dirty = True print("Removed", name, "(", hashes_removed, "hashes).") @@ -386,6 +415,11 @@ def list(self, print_fn=None): """ List all the known items. 
""" if not print_fn: print_fn = print - for name, count in zip(self.names, self.hashesperid): + for name, count, duration, density in zip(self.names, self.hashesperid, self.durationperiod, self.densityperid): + # the purpose of this display is to improve fingreptinting parameter in order to compare across multiple batches if name: - print_fn(name + " (" + str(count) + " hashes)") + if duration != 0 and duration > 0: + real_density = str(float(count)/duration) + print_fn("track: \'{}\', hash_count[units]: {}, duration[s]: {}, real_density: {}, fingerprinted_density: {}".format(name, str(count), duration, real_density, density)) + else: + print_fn("track: \'{}\', hash_count[units]: {}, duration[s]: {}, fingerprinted_density: {}".format(name, str(count), duration, density)) diff --git a/requirements.sh b/requirements.sh new file mode 100644 index 0000000..6689d03 --- /dev/null +++ b/requirements.sh @@ -0,0 +1,4 @@ +build-essential +python3.6 +python3-pip +ffmpeg \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d605c78..1b9bb91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,6 @@ numpy scipy docopt joblib +audioread +matplotlib psutil diff --git a/tests/profile_audfprint.py b/tests/profile_audfprint.py index 8871f54..50d2db1 100644 --- a/tests/profile_audfprint.py +++ b/tests/profile_audfprint.py @@ -1,11 +1,15 @@ # coding=utf-8 import cProfile import pstats +import os, sys # for local testing + +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath('..')) # noinspection PyUnresolvedReferences import audfprint -argv = ["audfprint", "new", "-d", "tmp.fpdb", "--density", "200", +argv = ["audfprint", "new", "-d", "fpdbase.pklz", "--density", "200", "data/Nine_Lives/01-Nine_Lives.mp3", "data/Nine_Lives/02-Falling_In_Love.mp3", "data/Nine_Lives/03-Hole_In_My_Soul.mp3", @@ -26,3 +30,4 @@ p.sort_stats('time') p.print_stats(10) + diff --git a/tests/profile_audfprint_match.py b/tests/profile_audfprint_match.py index 
fb859b0..a4e6bd1 100644 --- a/tests/profile_audfprint_match.py +++ b/tests/profile_audfprint_match.py @@ -1,15 +1,38 @@ # coding=utf-8 import cProfile import pstats +import os, sys # for local testing + +if __name__ == "__main__": + sys.path.insert(0, os.path.abspath('..')) # noinspection PyUnresolvedReferences import audfprint -argv = ["audfprint", "match", "-d", "fpdbase.pklz", "--density", "200", "query.mp3"] +def run_test(): + cProfile.run('audfprint.main(argv)', 'fpmstats') + + p = pstats.Stats('fpmstats') + + p.sort_stats('time') + p.print_stats(10) + +# run test with text file output +print("=== Match to text file output ==============================================") +argv = ["audfprint", "match", "-d", "fpdbase.pklz", "--density", "200", "--opfile", "match.output.txt", "data/query.mp3"] +run_test() -cProfile.run('audfprint.main(argv)', 'fpmstats') +# run test with JSON file output; also turn down verbosity to not output +print("=== Match to JSON file output ==============================================") +argv = ["audfprint", "match", "-d", "fpdbase.pklz", "--density", "200", "--json", "--opfile", "match.output.json", "--verbose", 0, "data/query.mp3"] +run_test() -p = pstats.Stats('fpmstats') +# run test with JSON file output; also turn down verbosity to not output +print("=== Match to JSON file output (with time range search) =====================") +argv = ["audfprint", "match", "-d", "fpdbase.pklz", "--find-time-range", "--density", "200", "--json", "--opfile", "match.output_aligned.json", "--verbose", 0, "data/query.mp3"] +run_test() -p.sort_stats('time') -p.print_stats(10) +# run test with text file output +print("=== Match to text file output (exact match determintion and precise match) =") +argv = ["audfprint", "match", "-d", "fpdbase.pklz", "--find-time-range", "--exact-count", "--density", "200", "--json", "--opfile", "match.output_exact.json", "--verbose", 0, "data/query.mp3"] +run_test()