Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
language: python
python:
- "2.7"
- "3.3"
- "3.4"
- "3.5"
- "3.8"
- "3.9"
# command to install dependencies
install:
# - "pip install -r requirements.txt"
- "pip install setuptools --upgrade; python setup.py install"
# command to run tests
script: nosetests
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
![SoundScrape!](http://i.imgur.com/nHAt2ow.png)

SoundScrape [![Build Status](https://travis-ci.org/Miserlou/SoundScrape.svg)](https://travis-ci.org/Miserlou/SoundScrape) [![Python 2](https://img.shields.io/badge/Python-2-brightgreen.svg)](https://pypi.python.org/pypi/soundscrape/) [![Python 3](https://img.shields.io/badge/Python-3-brightgreen.svg)](https://pypi.python.org/pypi/soundscrape/) [![PyPI](https://img.shields.io/pypi/v/soundscrape.svg)](https://pypi.python.org/pypi/SoundScrape)
SoundScrape [![Build Status](https://travis-ci.org/Miserlou/SoundScrape.svg)](https://travis-ci.org/Miserlou/SoundScrape) [![Python 3](https://img.shields.io/badge/Python-3-brightgreen.svg)](https://pypi.python.org/pypi/soundscrape/) [![PyPI](https://img.shields.io/pypi/v/soundscrape.svg)](https://pypi.python.org/pypi/SoundScrape)
==============

**SoundScrape** makes it super easy to download artists from SoundCloud (and Bandcamp and MixCloud) - even those which don't have download links! It automatically creates ID3 tags as well (including album art), which is handy.
Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@
'License :: OSI Approved :: Apache Software License',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Topic :: Internet :: WWW/HTTP',
'Topic :: Internet :: WWW/HTTP :: Dynamic Content',
],
Expand Down
2 changes: 1 addition & 1 deletion soundscrape/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.30.1'
__version__ = '0.31.0'
98 changes: 72 additions & 26 deletions soundscrape/soundscrape.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
#! /usr/bin/env python
from __future__ import unicode_literals

import argparse
import demjson
import html
import os
import re
import requests
Expand All @@ -19,11 +18,16 @@
from os.path import dirname, exists, join
from os import access, mkdir, W_OK

# Pick the HTML-entity decoder by capability instead of by version number.
# ``html.unescape`` is available from Python 3.4 on. Older interpreters only
# provide ``HTMLParser.unescape``; ``html.parser`` is a submodule that a bare
# ``import html`` does not load, so it is imported explicitly in the fallback.
try:
    html_unescape = html.unescape
except AttributeError:
    from html.parser import HTMLParser

    html_unescape = HTMLParser().unescape

####################################################################

# Please be nice with this!
CLIENT_ID = '175c043157ffae2c6d5fed16c3d95a4c'
CLIENT_SECRET = '99a51990bd81b6a82c901d4cc6828e46'
CLIENT_ID = 'a3dd183a357fcff9a6943c0d65664087'
CLIENT_SECRET = '7e10d33e967ad42574124977cf7fa4b7'
MAGIC_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

AGGRESSIVE_CLIENT_ID = 'OmTFHKYSMLFqnu2HHucmclAptedxWXkq'
Expand Down Expand Up @@ -219,7 +223,7 @@ def process_soundcloud(vargs):
tagged = tag_file(filename,
artist=track_data['artist'],
title=track_data['title'],
year='2016',
year='2018',
genre='',
album='',
artwork_url='')
Expand All @@ -232,6 +236,7 @@ def process_soundcloud(vargs):
filenames.append(filename)

else:

aggressive = False

# This is likely a 'likes' page.
Expand Down Expand Up @@ -267,6 +272,7 @@ def process_soundcloud(vargs):
aggressive = True
filenames = []

# this might be buggy
data = get_soundcloud_api2_data(artist_id)

for track in data['collection']:
Expand Down Expand Up @@ -431,6 +437,8 @@ def download_tracks(client, tracks, num_tracks=sys.maxsize, downloadable=False,
continue

puts_safe(colored.green("Downloading") + colored.white(": " + track['title']))


if track.get('direct', False):
location = track['stream_url']
else:
Expand All @@ -457,7 +465,7 @@ def download_tracks(client, tracks, num_tracks=sys.maxsize, downloadable=False,
filenames.append(filename)
except Exception as e:
puts_safe(colored.red("Problem downloading ") + colored.white(track['title']))
puts_safe(e)
puts_safe(str(e))

return filenames

Expand Down Expand Up @@ -541,7 +549,12 @@ def process_bandcamp(vargs):
else:
bc_url = 'https://' + artist_url + '.bandcamp.com/music'

filenames = scrape_bandcamp_url(bc_url, num_tracks=vargs['num_tracks'], folders=vargs['folders'], custom_path=vargs['path'])
filenames = scrape_bandcamp_url(
bc_url,
num_tracks=vargs['num_tracks'],
folders=vargs['folders'],
custom_path=vargs['path'],
)

# check if we have lists inside a list, which indicates the
# scraping has gone recursive, so we must format the output
Expand Down Expand Up @@ -576,11 +589,15 @@ def scrape_bandcamp_url(url, num_tracks=sys.maxsize, folders=False, custom_path=
# so we call the scrape_bandcamp_url() method for each one
if type(album_data) is list:
for album_url in album_data:
filenames.append(scrape_bandcamp_url(album_url, num_tracks, folders, custom_path))
filenames.append(
scrape_bandcamp_url(
album_url, num_tracks, folders, custom_path
)
)
return filenames

artist = album_data["artist"]
album_name = album_data["album_name"]
artist = album_data.get("artist")
album_name = album_data.get("album_title")

if folders:
if album_name:
Expand Down Expand Up @@ -647,21 +664,58 @@ def scrape_bandcamp_url(url, num_tracks=sys.maxsize, folders=False, custom_path=
return filenames


def extract_embedded_json_from_attribute(request, attribute, debug=False):
    """
    Pull a JSON object out of an HTML attribute value in a response body.

    The response text is cut at the first occurrence of ``<attribute>="``,
    the value is taken up to the next double quote, HTML entities are
    unescaped, and the result is decoded with demjson (the embedded JSON is
    "sloppy", so the more tolerant demjson parser is used rather than the
    native one).

    Args:
        request (obj:`requests.Response`): HTTP GET response to search
        attribute (str): name of the attribute holding the desired JSON object
        debug (bool, optional): whether to print debug messages

    Returns:
        The decoded JSON object, or None if any step of the extraction failed
    """
    try:
        marker = '{}="'.format(attribute)
        after_marker = request.text.split(marker)[1]
        raw_value = after_marker.split('"')[0]
        parsed = demjson.decode(html_unescape(raw_value))
        if debug:
            pretty = demjson.encode(
                parsed,
                compactly=False,
                indent_amount=2,
            )
            print('extracted JSON: ' + pretty)
    except Exception as e:
        # Best effort: any failure (attribute missing, undecodable value, ...)
        # is reported to the caller as None.
        if debug:
            print(e)
        return None
    return parsed


def get_bandcamp_metadata(url):
"""
Read information from the Bandcamp JavaScript object.
Read information from Bandcamp embedded JavaScript object notation.
The method may return a list of URLs (indicating this is probably a "main" page which links to one or more albums),
or a JSON if we can already parse album/track info from the given url.
The JSON is "sloppy". The native python JSON parser often can't deal, so we use the more tolerant demjson instead.
"""
request = requests.get(url)
output = {}
try:
sloppy_json = request.text.split("var TralbumData = ")
sloppy_json = sloppy_json[1].replace('" + "', "")
sloppy_json = sloppy_json.replace("'", "\'")
sloppy_json = sloppy_json.split("};")[0] + "};"
sloppy_json = sloppy_json.replace("};", "}")
output = demjson.decode(sloppy_json)
for attr in ['data-tralbum', 'data-embed']:
output.update(
extract_embedded_json_from_attribute(
request, attr
)
)
# if the JSON parser failed, we should consider it's a "/music" page,
# so we generate a list of albums/tracks and return it immediately
except Exception as e:
Expand All @@ -680,14 +734,6 @@ def get_bandcamp_metadata(url):
# according to http://stackoverflow.com/a/7323861
# (very unlikely, but better safe than sorry!)
output['genre'] = ' '.join(s for s in tags)
# make sure we always get the correct album name, even if this is a
# track URL (unless this track does not belong to any album, in which
# case the album name remains set as None.
output['album_name'] = None
regex_album_name = r'album_title\s*:\s*"([^"]+)"\s*,'
match = re.search(regex_album_name, request.text, re.MULTILINE)
if match:
output['album_name'] = match.group(1)

try:
artUrl = request.text.split("\"tralbumArt\">")[1].split("\">")[0].split("href=\"")[1]
Expand Down
86 changes: 37 additions & 49 deletions tests/test.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
import glob
import os
import re
import string
import sys
import unittest

import nose
from nose import case
from nose.pyversion import unbound_method
from nose import util

from mutagen.mp3 import EasyMP3
from soundscrape.soundscrape import get_client
from soundscrape.soundscrape import process_soundcloud
from soundscrape.soundscrape import process_bandcamp
from soundscrape.soundscrape import process_mixcloud
from soundscrape.soundscrape import process_audiomack
from soundscrape.soundscrape import process_musicbed


def rm_mp3():
    """Remove every file matching ``*.mp3`` from the current directory."""
    mp3_paths = glob.glob('*.mp3')
    for mp3_path in mp3_paths:
        os.remove(mp3_path)


class TestSoundscrape(unittest.TestCase):

Expand All @@ -31,45 +30,33 @@ def test_get_client(self):
self.assertTrue(bool(client))

def test_soundcloud(self):
for f in glob.glob('*.mp3'):
os.unlink(f)

rm_mp3()
mp3_count = len(glob.glob1('', "*.mp3"))
vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 9223372036854775807, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://soundcloud.com/fzpz/revised', 'keep': True}
process_soundcloud(vargs)
new_mp3_count = len(glob.glob1('', "*.mp3"))
self.assertTrue(new_mp3_count > mp3_count)

for f in glob.glob('*.mp3'):
os.unlink(f)
rm_mp3()

def test_soundcloud_hard(self):
for f in glob.glob('*.mp3'):
os.unlink(f)

rm_mp3()
mp3_count = len(glob.glob1('', "*.mp3"))
vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 1, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'puptheband', 'keep': False}
process_soundcloud(vargs)
new_mp3_count = len(glob.glob1('', "*.mp3"))
self.assertTrue(new_mp3_count > mp3_count)
self.assertTrue(new_mp3_count == 1) # This used to be 3, but is now 'Not available in United States.'

for f in glob.glob('*.mp3'):
os.unlink(f)
rm_mp3()

def test_soundcloud_hard_2(self):
for f in glob.glob('*.mp3'):
os.unlink(f)

rm_mp3()
mp3_count = len(glob.glob1('', "*.mp3"))
vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 1, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://soundcloud.com/lostdogz/snuggles-chapstick', 'keep': False}
process_soundcloud(vargs)
new_mp3_count = len(glob.glob1('', "*.mp3"))
self.assertTrue(new_mp3_count > mp3_count)
self.assertTrue(new_mp3_count == 1) # This used to be 3, but is now 'Not available in United States.'

for f in glob.glob('*.mp3'):
os.unlink(f)
rm_mp3()

# The test URL for this is no longer a WAV. Need a new testcase.
#
Expand All @@ -88,30 +75,35 @@ def test_soundcloud_hard_2(self):
# os.unlink(f)

def test_bandcamp(self):
for f in glob.glob('*.mp3'):
os.unlink(f)

rm_mp3()
mp3_count = len(glob.glob1('', "*.mp3"))
vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 9223372036854775807, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://atenrays.bandcamp.com/track/who-u-think'}
process_bandcamp(vargs)
new_mp3_count = len(glob.glob1('', "*.mp3"))
self.assertTrue(new_mp3_count > mp3_count)

for f in glob.glob('*.mp3'):
os.unlink(f)
rm_mp3()

def test_bandcamp_slashes(self):
for f in glob.glob('*.mp3'):
os.unlink(f)

rm_mp3()
mp3_count = len(glob.glob1('', "*.mp3"))
vargs = {'path':'', 'folders': False, 'group': False, 'track': '', 'num_tracks': 9223372036854775807, 'bandcamp': False, 'downloadable': False, 'likes': False, 'open': False, 'artist_url': 'https://defill.bandcamp.com/track/amnesia-chamber-harvest-skit'}
process_bandcamp(vargs)
new_mp3_count = len(glob.glob1('', "*.mp3"))
self.assertTrue(new_mp3_count > mp3_count)
rm_mp3()

def test_bandcamp_html_entities(self):
    """Download one Bandcamp track and check its file name and title tag.

    Expects exactly one MP3 to be produced, with ``CandA`` in the file
    name and ``C&A`` in the ID3 title (presumably the page encodes the
    ampersand as an HTML entity, hence the test name).
    """
    rm_mp3()
    vargs = {'path': '', 'folders': False, 'num_tracks': sys.maxsize, 'open': False, 'artist_url': 'https://anaalnathrakh.bandcamp.com/track/man-at-c-a-bonus-track'}
    process_bandcamp(vargs)
    mp3s = glob.glob('*.mp3')
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(1, len(mp3s))
    fn = mp3s[0]
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn('CandA', fn)
    t = EasyMP3(fn)['title']
    self.assertIn('C&A', t[0])
    rm_mp3()

for f in glob.glob('*.mp3'):
os.unlink(f)

# def test_musicbed(self):
# for f in glob.glob('*.mp3'):
Expand All @@ -131,11 +123,9 @@ def test_mixcloud(self):
MixCloud is being blocked from Travis, interestingly.
"""

for f in glob.glob('*.mp3'):
os.unlink(f)

for f in glob.glob('*.m4a'):
os.unlink(f)
# rm_mp3()
# for f in glob.glob('*.m4a'):
# os.unlink(f)

# shortest mix I could find that was still semi tolerable
#mp3_count = len(glob.glob1('', "*.mp3"))
Expand All @@ -146,11 +136,9 @@ def test_mixcloud(self):
#new_m4a_count = len(glob.glob1('', "*.m4a"))
#self.assertTrue((new_mp3_count > mp3_count) or (new_m4a_count > m4a_count))

for f in glob.glob('*.mp3'):
os.unlink(f)

for f in glob.glob('*.m4a'):
os.unlink(f)
# rm_mp3()
# for f in glob.glob('*.m4a'):
# os.unlink(f)

# def test_audiomack(self):
# for f in glob.glob('*.mp3'):
Expand Down