diff --git a/.travis.yml b/.travis.yml index eebb013..33db94c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,11 @@ language: python python: - - "2.7" - - "3.3" - "3.4" - "3.5" + - "3.8" + - "3.9" # command to install dependencies install: -# - "pip install -r requirements.txt" - "pip install setuptools --upgrade; python setup.py install" # command to run tests script: nosetests diff --git a/README.md b/README.md index 139d58a..9f2a37d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ![SoundScrape!](http://i.imgur.com/nHAt2ow.png) -SoundScrape [![Build Status](https://travis-ci.org/Miserlou/SoundScrape.svg)](https://travis-ci.org/Miserlou/SoundScrape) [![Python 2](https://img.shields.io/badge/Python-2-brightgreen.svg)](https://pypi.python.org/pypi/soundscrape/) [![Python 3](https://img.shields.io/badge/Python-3-brightgreen.svg)](https://pypi.python.org/pypi/soundscrape/) [![PyPI](https://img.shields.io/pypi/v/soundscrape.svg)](https://pypi.python.org/pypi/SoundScrape) +SoundScrape [![Build Status](https://travis-ci.org/Miserlou/SoundScrape.svg)](https://travis-ci.org/Miserlou/SoundScrape) [![Python 3](https://img.shields.io/badge/Python-3-brightgreen.svg)](https://pypi.python.org/pypi/soundscrape/) [![PyPI](https://img.shields.io/pypi/v/soundscrape.svg)](https://pypi.python.org/pypi/SoundScrape) ============== **SoundScrape** makes it super easy to download artists from SoundCloud (and Bandcamp and MixCloud) - even those which don't have download links! It automatically creates ID3 tags as well (including album art), which is handy. diff --git a/setup.py b/setup.py index 6c9851a..cefc64a 100644 --- a/setup.py +++ b/setup.py @@ -48,10 +48,11 @@ 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Topic :: Internet :: WWW/HTTP', 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', ], diff --git a/soundscrape/__init__.py b/soundscrape/__init__.py index b4cc240..c3d10d7 100644 --- a/soundscrape/__init__.py +++ b/soundscrape/__init__.py @@ -1 +1 @@ -__version__ = '0.30.1' +__version__ = '0.31.0' diff --git a/soundscrape/soundscrape.py b/soundscrape/soundscrape.py index 4132423..849f543 100755 --- a/soundscrape/soundscrape.py +++ b/soundscrape/soundscrape.py @@ -1,8 +1,7 @@ #! /usr/bin/env python -from __future__ import unicode_literals - import argparse import demjson +import html import os import re import requests @@ -19,11 +18,16 @@ from os.path import dirname, exists, join from os import access, mkdir, W_OK +if sys.version_info.minor < 4: + html_unescape = html.parser.HTMLParser().unescape +else: + html_unescape = html.unescape + #################################################################### # Please be nice with this! -CLIENT_ID = '175c043157ffae2c6d5fed16c3d95a4c' -CLIENT_SECRET = '99a51990bd81b6a82c901d4cc6828e46' +CLIENT_ID = 'a3dd183a357fcff9a6943c0d65664087' +CLIENT_SECRET = '7e10d33e967ad42574124977cf7fa4b7' MAGIC_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' AGGRESSIVE_CLIENT_ID = 'OmTFHKYSMLFqnu2HHucmclAptedxWXkq' @@ -219,7 +223,7 @@ def process_soundcloud(vargs): tagged = tag_file(filename, artist=track_data['artist'], title=track_data['title'], - year='2016', + year='2018', genre='', album='', artwork_url='') @@ -232,6 +236,7 @@ def process_soundcloud(vargs): filenames.append(filename) else: + aggressive = False # This is is likely a 'likes' page. @@ -267,6 +272,7 @@ def process_soundcloud(vargs): aggressive = True filenames = [] + # this might be buggy data = get_soundcloud_api2_data(artist_id) for track in data['collection']: @@ -431,6 +437,8 @@ def download_tracks(client, tracks, num_tracks=sys.maxsize, downloadable=False, continue puts_safe(colored.green("Downloading") + colored.white(": " + track['title'])) + + if track.get('direct', False): location = track['stream_url'] else: @@ -457,7 +465,7 @@ def download_tracks(client, tracks, num_tracks=sys.maxsize, downloadable=False, filenames.append(filename) except Exception as e: puts_safe(colored.red("Problem downloading ") + colored.white(track['title'])) - puts_safe(e) + puts_safe(str(e)) return filenames @@ -541,7 +549,12 @@ def process_bandcamp(vargs): else: bc_url = 'https://' + artist_url + '.bandcamp.com/music' - filenames = scrape_bandcamp_url(bc_url, num_tracks=vargs['num_tracks'], folders=vargs['folders'], custom_path=vargs['path']) + filenames = scrape_bandcamp_url( + bc_url, + num_tracks=vargs['num_tracks'], + folders=vargs['folders'], + custom_path=vargs['path'], + ) # check if we have lists inside a list, which indicates the # scraping has gone recursive, so we must format the output @@ -576,11 +589,15 @@ def scrape_bandcamp_url(url, num_tracks=sys.maxsize, folders=False, custom_path= # so we call the scrape_bandcamp_url() method for each one if type(album_data) is list: for album_url in album_data: - filenames.append(scrape_bandcamp_url(album_url, num_tracks, folders, custom_path)) + filenames.append( + scrape_bandcamp_url( + album_url, num_tracks, folders, custom_path + ) + ) return filenames - artist = album_data["artist"] - album_name = album_data["album_name"] + artist = album_data.get("artist") + album_name = album_data.get("album_title") if folders: if album_name: @@ -647,21 +664,58 @@ def scrape_bandcamp_url(url, num_tracks=sys.maxsize, folders=False, custom_path= return filenames +def extract_embedded_json_from_attribute(request, attribute, debug=False): + """ + Extract JSON object embedded in an element's attribute value. + + The JSON is "sloppy". The native python JSON parser often can't deal, + so we use the more tolerant demjson instead. + + Args: + request (obj:`requests.Response`): HTTP GET response from which to extract + attribute (str): name of the attribute holding the desired JSON object + debug (bool, optional): whether to print debug messages + + Returns: + The embedded JSON object as a dict, or None if extraction failed + """ + try: + embed = request.text.split('{}="'.format(attribute))[1] + embed = html_unescape( + embed.split('"')[0] + ) + output = demjson.decode(embed) + if debug: + print( + 'extracted JSON: ' + + demjson.encode( + output, + compactly=False, + indent_amount=2, + ) + ) + except Exception as e: + output = None + if debug: + print(e) + return output + + def get_bandcamp_metadata(url): """ - Read information from the Bandcamp JavaScript object. + Read information from Bandcamp embedded JavaScript object notation. The method may return a list of URLs (indicating this is probably a "main" page which links to one or more albums), or a JSON if we can already parse album/track info from the given url. - The JSON is "sloppy". The native python JSON parser often can't deal, so we use the more tolerant demjson instead. """ request = requests.get(url) + output = {} try: - sloppy_json = request.text.split("var TralbumData = ") - sloppy_json = sloppy_json[1].replace('" + "', "") - sloppy_json = sloppy_json.replace("'", "\'") - sloppy_json = sloppy_json.split("};")[0] + "};" - sloppy_json = sloppy_json.replace("};", "}") - output = demjson.decode(sloppy_json) + for attr in ['data-tralbum', 'data-embed']: + output.update( + extract_embedded_json_from_attribute( + request, attr + ) + ) # if the JSON parser failed, we should consider it's a "/music" page, # so we generate a list of albums/tracks and return it immediately except Exception as e: @@ -680,14 +734,6 @@ def get_bandcamp_metadata(url): # according to http://stackoverflow.com/a/7323861 # (very unlikely, but better safe than sorry!) output['genre'] = ' '.join(s for s in tags) - # make sure we always get the correct album name, even if this is a - # track URL (unless this track does not belong to any album, in which - # case the album name remains set as None. - output['album_name'] = None - regex_album_name = r'album_title\s*:\s*"([^"]+)"\s*,' - match = re.search(regex_album_name, request.text, re.MULTILINE) - if match: - output['album_name'] = match.group(1) try: artUrl = request.text.split("\"tralbumArt\">")[1].split("\">")[0].split("href=\"")[1] diff --git a/tests/test.py b/tests/test.py index 626bf4b..1ec1a54 100644 --- a/tests/test.py +++ b/tests/test.py @@ -1,21 +1,20 @@ import glob import os -import re -import string import sys import unittest -import nose -from nose import case -from nose.pyversion import unbound_method -from nose import util - +from mutagen.mp3 import EasyMP3 from soundscrape.soundscrape import get_client from soundscrape.soundscrape import process_soundcloud from soundscrape.soundscrape import process_bandcamp -from soundscrape.soundscrape import process_mixcloud -from soundscrape.soundscrape import process_audiomack -from soundscrape.soundscrape import process_musicbed + + +def rm_mp3(): + """ deletes all ``*.mp3`` files in current directory + """ + for f in glob.glob('*.mp3'): + os.unlink(f) + class TestSoundscrape(unittest.TestCase): @@ -31,45 +30,33 @@ def test_get_client(self): self.assertTrue(bool(client)) def test_soundcloud(self): - for f in glob.glob('*.mp3'): - os.unlink(f) - + rm_mp3() mp3_count = len(glob.glob1('', "*.mp3")) vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 9223372036854775807, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://soundcloud.com/fzpz/revised', 'keep': True} process_soundcloud(vargs) new_mp3_count = len(glob.glob1('', "*.mp3")) self.assertTrue(new_mp3_count > mp3_count) - - for f in glob.glob('*.mp3'): - os.unlink(f) + rm_mp3() def test_soundcloud_hard(self): - for f in glob.glob('*.mp3'): - os.unlink(f) - + rm_mp3() mp3_count = len(glob.glob1('', "*.mp3")) vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 1, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'puptheband', 'keep': False} process_soundcloud(vargs) new_mp3_count = len(glob.glob1('', "*.mp3")) self.assertTrue(new_mp3_count > mp3_count) self.assertTrue(new_mp3_count == 1) # This used to be 3, but is now 'Not available in United States.' - - for f in glob.glob('*.mp3'): - os.unlink(f) + rm_mp3() def test_soundcloud_hard_2(self): - for f in glob.glob('*.mp3'): - os.unlink(f) - + rm_mp3() mp3_count = len(glob.glob1('', "*.mp3")) vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 1, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://soundcloud.com/lostdogz/snuggles-chapstick', 'keep': False} process_soundcloud(vargs) new_mp3_count = len(glob.glob1('', "*.mp3")) self.assertTrue(new_mp3_count > mp3_count) self.assertTrue(new_mp3_count == 1) # This used to be 3, but is now 'Not available in United States.' - - for f in glob.glob('*.mp3'): - os.unlink(f) + rm_mp3() # The test URL for this is no longer a WAV. Need a new testcase. # @@ -88,30 +75,35 @@ def test_soundcloud_hard_2(self): # os.unlink(f) def test_bandcamp(self): - for f in glob.glob('*.mp3'): - os.unlink(f) - + rm_mp3() mp3_count = len(glob.glob1('', "*.mp3")) vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 9223372036854775807, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://atenrays.bandcamp.com/track/who-u-think'} process_bandcamp(vargs) new_mp3_count = len(glob.glob1('', "*.mp3")) self.assertTrue(new_mp3_count > mp3_count) - - for f in glob.glob('*.mp3'): - os.unlink(f) + rm_mp3() def test_bandcamp_slashes(self): - for f in glob.glob('*.mp3'): - os.unlink(f) - + rm_mp3() mp3_count = len(glob.glob1('', "*.mp3")) vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 9223372036854775807, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://defill.bandcamp.com/track/amnesia-chamber-harvest-skit'} process_bandcamp(vargs) new_mp3_count = len(glob.glob1('', "*.mp3")) self.assertTrue(new_mp3_count > mp3_count) + rm_mp3() + + def test_bandcamp_html_entities(self): + rm_mp3() + vargs = {'path': '', 'folders': False, 'num_tracks': sys.maxsize, 'open': False, 'artist_url': 'https://anaalnathrakh.bandcamp.com/track/man-at-c-a-bonus-track'} + process_bandcamp(vargs) + mp3s = glob.glob('*.mp3') + self.assertEquals(1, len(mp3s)) + fn = mp3s[0] + self.assertTrue('CandA' in fn) + t = EasyMP3(fn)['title'] + self.assertTrue('C&A' in t[0]) + rm_mp3() - for f in glob.glob('*.mp3'): - os.unlink(f) # def test_musicbed(self): # for f in glob.glob('*.mp3'): @@ -131,11 +123,9 @@ def test_mixcloud(self): MixCloud is being blocked from Travis, interestingly. """ - for f in glob.glob('*.mp3'): - os.unlink(f) - - for f in glob.glob('*.m4a'): - os.unlink(f) + # rm_mp3() + # for f in glob.glob('*.m4a'): + # os.unlink(f) # shortest mix I could find that was still semi tolerable #mp3_count = len(glob.glob1('', "*.mp3")) @@ -146,11 +136,9 @@ def test_mixcloud(self): #new_m4a_count = len(glob.glob1('', "*.m4a")) #self.assertTrue((new_mp3_count > mp3_count) or (new_m4a_count > m4a_count)) - for f in glob.glob('*.mp3'): - os.unlink(f) - - for f in glob.glob('*.m4a'): - os.unlink(f) + # rm_mp3() + # for f in glob.glob('*.m4a'): + # os.unlink(f) # def test_audiomack(self): # for f in glob.glob('*.mp3'):