Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
11cac3e
requirements.txt and Makefile enhanced:
vriez Sep 13, 2018
c2d9ddf
JSON object being returned instead of a string.
vriez Oct 10, 2018
02bb61c
objects being displayed in compliance with the --find-time-range option
vriez Oct 15, 2018
08547f8
checking results backwards
vriez Oct 16, 2018
a2273a5
displaying all retrieved track hashes.
vriez Oct 17, 2018
0142aee
a bit more greppable output for the list option.
vriez Oct 17, 2018
3a8c62a
Now storing the 'fingerprinted_density', the density with which the tr…
vriez Oct 17, 2018
dc842ec
a bit more greppable output for the new option.
vriez Oct 17, 2018
55ae5d3
Improved the displayed messages within `list` and `new` commands, as …
vriez Oct 29, 2018
416c8ba
--verbose option needs to be double-checked, it displays additional i…
vriez Oct 29, 2018
48991f9
Added the minimal Unix tools in order to run the app. Also, some slig…
vriez Oct 30, 2018
b8b1262
Some suggestions on `DOWNLOAD AND INSTALLATION`
vriez Oct 30, 2018
b8282fc
Some improvements made on the Object descriptors explanations.
vriez Oct 30, 2018
bf3dccc
-O added onto README.md
vriez Oct 30, 2018
a91497d
fix around --verbose.
vriez Nov 5, 2018
8283d72
no longer saving query matches into a .csv file since they are not st…
vriez Nov 5, 2018
e979527
Merge branch 'master' into test_objects_returns
Jan 16, 2019
37febe8
update JSON writing as list to file
Jan 16, 2019
26b75c0
update profile script for JSON output compare
Jan 16, 2019
85f3ea5
following text-based output, clobber existing JSON
Jan 16, 2019
c59348f
restore verbosity to JSON mode output
Jan 27, 2019
2cbd31b
modify name of returned JSON attributes
Jan 27, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,12 @@ docs/_build/
fpstats
fpmstats
tmp.fpdb

# Dev Stuff
venv/
tests/
precomppkdir/
queries/

# OS objects
.DS_Store
60 changes: 30 additions & 30 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,70 +16,70 @@ test: test_onecore test_onecore_precomp test_onecore_newmerge test_onecore_preco
rm -f fpdbase*.pklz

test_onecore: fpdbase.pklz
${AUDFPRINT} match --dbase fpdbase.pklz query.mp3
${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3

test_remove: fpdbase.pklz
${AUDFPRINT} remove --dbase fpdbase.pklz Nine_Lives/05-Full_Circle.mp3 Nine_Lives/01-Nine_Lives.mp3
${AUDFPRINT} remove --dbase fpdbase.pklz tests/data/Nine_Lives/05-Full_Circle.mp3 tests/data/Nine_Lives/01-Nine_Lives.mp3
${AUDFPRINT} list --dbase fpdbase.pklz
${AUDFPRINT} add --dbase fpdbase.pklz Nine_Lives/01-Nine_Lives.mp3 Nine_Lives/05-Full_Circle.mp3
${AUDFPRINT} add --dbase fpdbase.pklz tests/data/Nine_Lives/01-Nine_Lives.mp3 tests/data/Nine_Lives/05-Full_Circle.mp3
${AUDFPRINT} list --dbase fpdbase.pklz
${AUDFPRINT} match --dbase fpdbase.pklz query.mp3
${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3

fpdbase.pklz: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
${AUDFPRINT} new --dbase fpdbase.pklz Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase.pklz Nine_Lives/1*.mp3
${AUDFPRINT} new --dbase fpdbase.pklz tests/data/Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase.pklz tests/data/Nine_Lives/1*.mp3

test_onecore_precomp: precompdir
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/tests/data/Nine_Lives/1*
${AUDFPRINT} merge --dbase fpdbase1.pklz fpdbase0.pklz
${AUDFPRINT} match --dbase fpdbase1.pklz precompdir/query.afpt
${AUDFPRINT} match --dbase fpdbase1.pklz precompdir/tests/data/query.afpt

test_onecore_newmerge: precompdir
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase0.pklz precompdir/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precompdir/tests/data/Nine_Lives/1*
rm -f fpdbase2.pklz
${AUDFPRINT} newmerge --dbase fpdbase2.pklz fpdbase0.pklz fpdbase1.pklz
${AUDFPRINT} match --dbase fpdbase2.pklz precompdir/query.afpt
${AUDFPRINT} match --dbase fpdbase2.pklz precompdir/tests/data/query.afpt

precompdir: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
rm -rf precompdir
mkdir precompdir
${AUDFPRINT} precompute --precompdir precompdir Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompdir precompdir --shifts 4 query.mp3
${AUDFPRINT} precompute --precompdir precompdir tests/data/Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompdir precompdir --shifts 4 tests/data/query.mp3

test_onecore_precomppk: precomppkdir
${AUDFPRINT} new --dbase fpdbase0.pklz precomppkdir/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precomppkdir/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase0.pklz precomppkdir/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase1.pklz precomppkdir/tests/data/Nine_Lives/1*
${AUDFPRINT} merge --dbase fpdbase1.pklz fpdbase0.pklz
${AUDFPRINT} match --dbase fpdbase1.pklz precomppkdir/query.afpk
${AUDFPRINT} match --dbase fpdbase1.pklz precomppkdir/tests/data/query.afpk
rm -rf precomppkdir

precomppkdir: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
rm -rf precomppkdir
mkdir precomppkdir
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir --shifts 4 query.mp3
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir tests/data/Nine_Lives/*.mp3
${AUDFPRINT} precompute --precompute-peaks --precompdir precomppkdir --shifts 4 tests/data/query.mp3

test_mucore: fpdbase_mu.pklz
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 query.mp3
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 tests/data/query.mp3

fpdbase_mu.pklz: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase_mu.pklz --ncores 4 Nine_Lives/1*.mp3
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 tests/data/Nine_Lives/0*.mp3
${AUDFPRINT} add --dbase fpdbase_mu.pklz --ncores 4 tests/data/Nine_Lives/1*.mp3

test_mucore_precomp: precompdir_mu
${AUDFPRINT} new --dbase fpdbase_mu0.pklz --ncores 4 precompdir_mu/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/Nine_Lives/1*
${AUDFPRINT} new --dbase fpdbase_mu0.pklz --ncores 4 precompdir_mu/tests/data/Nine_Lives/0*
${AUDFPRINT} new --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/tests/data/Nine_Lives/1*
${AUDFPRINT} merge --dbase fpdbase_mu.pklz fpdbase_mu0.pklz
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt precompdir_mu/query.afpt
${AUDFPRINT} match --dbase fpdbase_mu.pklz --ncores 4 precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt precompdir_mu/tests/data/query.afpt

precompdir_mu: audfprint.py audfprint_analyze.py audfprint_match.py hash_table.py
rm -rf precompdir_mu
mkdir precompdir_mu
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu Nine_Lives/*.mp3
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu --shifts 4 query.mp3 query.mp3 query.mp3 query.mp3 query.mp3 query.mp3
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu tests/data/Nine_Lives/*.mp3
${AUDFPRINT} precompute --ncores 4 --precompdir precompdir_mu --shifts 4 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3 tests/data/query.mp3

test_hash_mask:
${AUDFPRINT} new --dbase fpdbase.pklz --hashbits 16 Nine_Lives/*.mp3
${AUDFPRINT} match --dbase fpdbase.pklz query.mp3
test_hash_mask:
${AUDFPRINT} new --dbase fpdbase.pklz --hashbits 16 tests/data/Nine_Lives/*.mp3
${AUDFPRINT} match --dbase fpdbase.pklz tests/data/query.mp3
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Options:
-v <val>, --verbose <val> Verbosity level [default: 1]
-I, --illustrate Make a plot showing the match
-J, --illustrate-hpf Plot the match, using onset enhancement
-O, --json Return json object instead of string
-W <dir>, --wavdir <dir> Find sound files under this dir [default: ]
-V <ext>, --wavext <ext> Extension to add to wav file names [default: ]
--version Report version number
Expand Down Expand Up @@ -145,5 +146,3 @@ Scaling
The fingerprint database records 2^20 (~1M) distinct fingerprints, with (by default) 100 entries for each fingerprint bucket. When the bucket fills, track entries are dropped at random; since matching depends only on making a minimum number of matches, but no particular match, dropping some of the more popular ones does not prevent matching. The Matlab version has been successfully used for databases of 100k+ tracks. Reducing the hash density (`--density`) leads to smaller reference database size, and the capacity to record more reference items before buckets begin to fill; a density of 7.0 works well.

Times (in units of 256 samples, i.e., 23 ms at the default 11kHz sampling rate) are stored in the bottom 14 bits of each database entry, meaning that times larger than 2^14*0.023 = 380 sec, or about 6 mins, are aliased. If you want to correctly identify time offsets in tracks longer than this, you need to use a larger `--maxtimebits`; e.g. `--maxtimebits 16` increases the time range to 65,536 frames, or about 25 minutes at 11 kHz. The trade-off is that the remaining bits in each 32 bit entry (i.e., 18 bits for the default 14 bit times) are used to store the track ID. Thus, by default, the database can only remember 2^18 = 262k tracks; using a larger `--maxtimebits` will reduce this; similarly, you can increase the number of distinct tracks by reducing `--maxtimebits`, which doesn't prevent matching tracks, but progressively reduces discrimination as the number of distinct time slots reduces (and can make the reported time offsets, and time ranges for `--find-time-ranges`, completely wrong for longer tracks).


62 changes: 41 additions & 21 deletions audfprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import docopt # For command line interface
import joblib # for match
import json

import audfprint_analyze # The actual analyzer class/code
import audfprint_match # Access to match functions, used in command line interface
Expand All @@ -35,12 +36,16 @@ def filename_list_iterator(filelist, wavdir, wavext, listflag):
""" Iterator to yeild all the filenames, possibly interpreting them
as list files, prepending wavdir """
if not listflag:
# print(filelist)
for filename in filelist:
yield os.path.join(wavdir, filename + wavext)
# print('listflag', os.path.join(wavdir, filename + wavext))
yield filename#os.path.join(wavdir, filename + wavext)
else:
# print(filelist)
for listfilename in filelist:
with open(listfilename, 'r') as f:
for filename in f:
# print('|', os.path.join(wavdir, filename.rstrip('\n') + wavext))
yield os.path.join(wavdir, filename.rstrip('\n') + wavext)


Expand Down Expand Up @@ -130,7 +135,7 @@ def make_ht_from_list(analyzer, filelist, hashbits, depth, maxtime, pipe=None):
# Add in the files
for filename in filelist:
hashes = analyzer.wavfile2hashes(filename)
ht.store(filename, hashes)
ht.store(filename, hashes, analyzer.density)
# Pass back to caller
if pipe:
pipe.send(ht)
Expand Down Expand Up @@ -161,17 +166,17 @@ def do_cmd(cmd, analyzer, hash_tab, filename_iter, matcher, outdir, type, report
elif cmd == 'match':
# Running query, single-core mode
for num, filename in enumerate(filename_iter):
msgs = matcher.file_match_to_msgs(analyzer, hash_tab, filename, num)
report(msgs)
results = matcher_file_match(matcher, analyzer, hash_tab, filename, num)
report(results, True)

elif cmd == 'new' or cmd == 'add':
# Adding files
tothashes = 0
ix = 0
for filename in filename_iter:
report([time.ctime() + " ingesting #" + str(ix) + ": "
+ filename + " ..."])
dur, nhash = analyzer.ingest(hash_tab, filename)
# report([time.ctime() + " ingesting #" + str(ix) +" : "+ filename + " "+ str(hash_table.track_duration(filename))+"s ..."+str(nhash/dur)+"hashes/s"])
report(["ingesting # {} : track: {}, duration[sec]: {}, density[hashes/sec]: {} ".format(str(ix), filename, str(hash_table.track_duration(filename)), str(nhash//dur))])
tothashes += nhash
ix += 1

Expand Down Expand Up @@ -229,9 +234,11 @@ def multiproc_add(analyzer, hash_tab, filename_iter, report, ncores):
pr[core].join()


def matcher_file_match_to_msgs(matcher, analyzer, hash_tab, filename):
def matcher_file_match(matcher, analyzer, hash_tab, filename, num=None):
"""Cover for matcher.file_match_to_msgs so it can be passed to joblib"""
return matcher.file_match_to_msgs(analyzer, hash_tab, filename)
if matcher.json: # add for json parsing
return matcher.file_match_to_objs(analyzer, hash_tab, filename, num)
return matcher.file_match_to_msgs(analyzer, hash_tab, filename, num)


def do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher,
Expand All @@ -253,12 +260,12 @@ def do_cmd_multiproc(cmd, analyzer, hash_tab, filename_iter, matcher,
msgslist = joblib.Parallel(n_jobs=ncores)(
# Would use matcher.file_match_to_msgs(), but you
# can't use joblib on an instance method
joblib.delayed(matcher_file_match_to_msgs)(matcher, analyzer,
hash_tab, filename)
joblib.delayed(matcher_file_match)(matcher, analyzer,
hash_tab, filename)
for filename in filename_iter
)
for msgs in msgslist:
report(msgs)
report(msgs, True)

elif cmd == 'new' or cmd == 'add':
# We add by forking multiple parallel threads each running
Expand Down Expand Up @@ -288,7 +295,7 @@ def setup_analyzer(args):
# set default value for shifts depending on mode
if analyzer.shifts == 0:
# Default shift is 4 for match, otherwise 1
analyzer.shifts = 4 if args['match'] else 1
analyzer.shifts = 4 if args.get('match') else 1
analyzer.fail_on_error = not args['--continue-on-error']
return analyzer

Expand All @@ -305,6 +312,7 @@ def setup_matcher(args):
matcher.exact_count = args['--exact-count'] | args['--illustrate'] | args['--illustrate-hpf']
matcher.illustrate = args['--illustrate'] | args['--illustrate-hpf']
matcher.illustrate_hpf = args['--illustrate-hpf']
matcher.json = args.get('--json')
matcher.verbose = args['--verbose']
matcher.find_time_range = args['--find-time-range']
matcher.time_quantile = float(args['--time-quantile'])
Expand All @@ -316,12 +324,23 @@ def setup_reporter(args):
""" Creates a logging function, either to stderr or file"""
opfile = args['--opfile']
if opfile and len(opfile):
f = open(opfile, "w")

def report(msglist):
"""Log messages to a particular output file"""
for msg in msglist:
f.write(msg + "\n")
if args['--json']: # if args is JSON file, we need to serialize for each arg set
if os.path.exists(opfile): # following spirit of text-output, clobber existing file
os.unlink(opfile)
def report(msglist, is_data=False):
"""Log messages to a particular output file"""
# right now, if it's a string, dump to console, otherwise we assume
# it's a friendly object and pass through to JSON append function
if not is_data:
print(msglist)
else:
audfprint_match.json_append_objs(opfile, msglist)
else:
f = open(opfile, "w")
def report(msglist, is_data=False):
"""Log messages to a particular output file"""
for msg in msglist:
f.write(msg + "\n")
else:
def report(msglist):
"""Log messages by printing to stdout"""
Expand Down Expand Up @@ -374,13 +393,14 @@ def report(msglist):
-v <val>, --verbose <val> Verbosity level [default: 1]
-I, --illustrate Make a plot showing the match
-J, --illustrate-hpf Plot the match, using onset enhancement
-O, --json Return json object instead of string
-W <dir>, --wavdir <dir> Find sound files under this dir [default: ]
-V <ext>, --wavext <ext> Extension to add to wav file names [default: ]
--version Report version number
--help Print this message
"""

__version__ = 20150406
__version__ = 20190116


def main(argv):
Expand Down Expand Up @@ -480,8 +500,8 @@ def main(argv):
skip_existing=args['--skip-existing'],
strip_prefix=args['--wavdir'])

elapsedtime = time_clock() - initticks
if analyzer and analyzer.soundfiletotaldur > 0.:
elapsedtime = time.clock() - initticks
if analyzer and analyzer.soundfiletotaldur > 0. and not args['--verbose']:
print("Processed "
+ "%d files (%.1f s total dur) in %.1f s sec = %.3f x RT" \
% (analyzer.soundfilecount, analyzer.soundfiletotaldur,
Expand Down
11 changes: 6 additions & 5 deletions audfprint_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ def find_peaks(self, d, sr):
n_hop/sr secs), second is the FFT bin (in units of sr/n_fft
Hz).
"""

if len(d) == 0:
return []

Expand Down Expand Up @@ -340,7 +341,9 @@ def wavfile2peaks(self, filename, shifts=None):
list of (time, bin) pairs. If specified, resample to sr first.
shifts > 1 causes hashes to be extracted from multiple shifts of
waveform, to reduce frame effects. """
ext = os.path.splitext(filename)[1]

_, ext = os.path.splitext(filename)

if ext == PRECOMPPKEXT:
# short-circuit - precomputed fingerprint file
peaks = peaks_load(filename)
Expand Down Expand Up @@ -414,8 +417,6 @@ def wavfile2hashes(self, filename):
]).astype(np.int32)
hashes = unique_hashes
# Or simply np.unique(query_hashes, axis=0) for numpy >= 1.13

# print("wavfile2hashes: read", len(hashes), "hashes from", filename)
return hashes

# ########## functions to link to actual hash table index database ###### #
Expand Down Expand Up @@ -443,7 +444,7 @@ def ingest(self, hashtable, filename):
# n_fft=n_fft,
# n_hop=n_hop)))
hashes = self.wavfile2hashes(filename)
hashtable.store(filename, hashes)
hashtable.store(filename, hashes, self.density)
# return (len(d)/float(sr), len(hashes))
# return (np.max(hashes, axis=0)[0]*n_hop/float(sr), len(hashes))
# soundfiledur is set up in wavfile2hashes, use result here
Expand Down Expand Up @@ -562,7 +563,7 @@ def glob2hashtable(pattern, density=20.0):
totdur = 0.0
tothashes = 0
for ix, file_ in enumerate(filelist):
print(time.ctime(), "ingesting #", ix, ":", file_, "...")
print(time.ctime(), "ingesting #", ix, ":", file_, track_duration(ix), ht.densityperid[ix], "...")
dur, nhash = g2h_analyzer.ingest(ht, file_)
totdur += dur
tothashes += nhash
Expand Down
Loading