OUT_SUBFOLDER = 'contrib_data'
AUTHOR_DATA = 'author_data.json'


def verify_local_repo_location(repo):
    """Check that *repo* is a directory on the local filesystem.

    Parameters
    ----------
    repo : str
        Path expected to contain a checked-out repository.

    Raises
    ------
    IOError
        When the path does not exist or is not a directory.
    """
    if not os.path.isdir(repo):
        raise IOError('could not locate repository {}'.format(repo))


def build_out_path(repo_name, parent_path=None):
    """Return the output-data folder path for *repo_name*.

    *parent_path* defaults to the current working directory; the
    OUT_SUBFOLDER leaf is appended so every tool agrees on the layout.
    """
    if parent_path is None:
        parent_path = os.path.abspath(os.curdir)
    out_path = os.path.join(parent_path, repo_name, OUT_SUBFOLDER)
    return out_path


def make_output_folder(path_, overwrite):
    """Create *path_*, clobbering an existing folder when *overwrite*.

    NOTE(review): one line of this function falls between diff hunks and
    is not visible; the ``elif overwrite:`` guard is reconstructed from
    the surrounding context (rmtree then mkdir) -- confirm against the
    full file.
    """
    if not os.path.exists(path_):
        os.mkdir(path_)
    elif overwrite:
        rmtree(path_)
        os.mkdir(path_)
existing data?, default True') +@click.option('--verbose/--no-verbose', default=True) def main(repo, out_dir, clobber_output, verbose): """ """ import logging @@ -55,10 +61,18 @@ def main(repo, out_dir, clobber_output, verbose): repo_name = os.path.basename(repo) make_output_folder(out_dir, overwrite=clobber_output) contributor_data = author_minded(repo) - citation_data = pmc_data('SPSS') - logging.info("output path: %s" % os.path.join(out_dir,'contributor_data.json')) - contributor_data.to_json(os.path.join(out_dir,'contributor_data.json'), date_format='iso') - citation_data['citations'].to_json(os.path.join(out_dir,'citation_data.json')) + citation_data = pubmed_data('SPSS') + depends_data = get_dependencies(repo_name, repo) + logging.info('got dependency data of type {}'.format(type(depends_data))) + logging.info('got dependency data:\n {}'.format(depends_data)) + logging.info("output path: %s" % os.path.join(out_dir, + 'contributor_data.json')) + contributor_data.to_json(os.path.join(out_dir, + 'contributor_data.json'), + date_format='iso') + citation_data['citations'].to_json(os.path.join(out_dir, + 'citation_data.json')) + depends_data.to_json(os.path.join(out_dir, 'dependencies_data.json')) if __name__ == '__main__': main() diff --git a/commit_opener/depsy.py b/commit_opener/depsy.py new file mode 100644 index 0000000..a44e64d --- /dev/null +++ b/commit_opener/depsy.py @@ -0,0 +1,133 @@ +import re +import pickle +import ast +import os.path +import errno +import requests + +# """Functions from depsy""" + + +def parse_requirements_txt(contents): + # see here for spec used in parsing the file: + # https://pip.readthedocs.org/en/1.1/requirements.html#the-requirements-file-format + # it doesn't mention the '#' comment but found it often in examples. + # not using this test str in the function, just a handy place to keep it. 
+ test_str = """# my comment +file://blahblah +foo==10.2 +baz>=3.6 +# other comment +foo.bar>=3.33 +foo-bar==2.2 +foo_bar==1.1 +foo == 5.5 +.for some reason there is a dot sometimes +--index-url blahblah +-e http://blah + foo_with_space_in_front = 1.1""" + + reqs = re.findall( + '^(?!file:|-|\.)\s*([\w\.-]+)', + contents, + re.MULTILINE | re.IGNORECASE + ) + return sorted(reqs) + + +def parse_setup_py(contents): + parsed = ast.parse(contents) + ret = [] + # see ast docs: https://greentreesnakes.readthedocs.org/en/latest/index.html + for node in ast.walk(parsed): + try: + if node.func.id == "setup": + for keyword in node.keywords: + if keyword.arg == "install_requires": + print("found requirements in setup.py 'install_requires' arg") + for elt in keyword.value.elts: + ret.append(_clean_setup_req(elt.s)) + + if keyword.arg == "requires": + print("found requirements in setup.py 'requires' arg") + for elt in keyword.value.elts: + ret.append(_clean_setup_req(elt.s)) + + if keyword.arg == "extras_require": + print("found requirements in setup.py 'extras_require' arg") + for my_list in keyword.value.values: + for elt in my_list.elts: + ret.append(_clean_setup_req(elt.s)) + + except AttributeError: + continue + + return sorted(ret) + + +class PythonStandardLibs(): + + def __init__(self): + self.url = "https://docs.python.org/2.7/py-modindex.html" + self.data_dir = os.path.join(os.path.dirname(__file__), + "../../data") + + self.pickle_path = os.path.join(self.data_dir, + "python_standard_libs.pickle") + self.libs = None + + def _mkdir(self): + try: + os.makedirs(self.data_dir) + except OSError as exp: + if exp.errno != errno.EEXIST: + raise + self.pickle_path = os.path.join(self.data_dir, + "python_standard_libs.pickle") + + def retrieve_from_web(self): + # only needs to be used once ever, here for tidiness + # checked the result into source control as python_standard_libs.pickle + html = requests.get(self.url).text + exp = r'class="xref">([^<]+)' + matches = 
class PythonStandardLibs():
    """Fetch and cache the names of the Python 2.7 standard-library modules.

    The list is scraped once from the docs module index and pickled
    under ``data/`` so later runs work offline.
    """

    def __init__(self):
        # module index page listing every stdlib module
        self.url = "https://docs.python.org/2.7/py-modindex.html"
        # NOTE(review): path is relative to this source file and assumes
        # a ../../data layout -- confirm against the project tree
        self.data_dir = os.path.join(os.path.dirname(__file__),
                                     "../../data")
        self.pickle_path = os.path.join(self.data_dir,
                                        "python_standard_libs.pickle")
        self.libs = None

    def _mkdir(self):
        """Create the data dir; tolerate only 'already exists' errors."""
        try:
            os.makedirs(self.data_dir)
        except OSError as exp:
            if exp.errno != errno.EEXIST:
                raise
        # (the redundant pickle_path reassignment that used to live here
        # duplicated __init__ and has been removed)

    def retrieve_from_web(self):
        """Scrape the module names from self.url into self.libs.

        Only needs to be used once ever; the result is pickled so the
        scrape does not have to be repeated.
        """
        # lazy import: 'requests' is only needed for the one-off scrape,
        # so the module stays importable without it
        import requests
        html = requests.get(self.url).text
        exp = r'class="xref">([^<]+)'
        matches = re.findall(exp, html)
        # keep top-level modules only; drop dotted submodules
        self.libs = [m for m in matches if '.' not in m]

    def pickle_libs(self):
        """Persist self.libs to the pickle file, scraping first if needed."""
        if self.libs is None:
            self.retrieve_from_web()

        self._mkdir()
        with open(self.pickle_path, "wb") as f:
            pickle.dump(self.libs, f)

        print("saved these to file: {}".format(self.libs))

    def get(self):
        """Populate self.libs from the pickle, falling back to the web."""
        if self.libs is None:
            try:
                with open(self.pickle_path, "rb") as f:
                    print("Loading list of Standard Python Libraries from pickle file")
                    self.libs = pickle.load(f)
            # bug fix: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit
            except (IOError, OSError, pickle.PickleError):
                self.retrieve_from_web()
                self.pickle_libs()

    def clean(self):
        """Delete the cached pickle; a missing file is not an error."""
        try:
            os.remove(self.pickle_path)
        except OSError:  # bug fix: was a bare `except:`
            pass


def save_python_standard_libs(clean=False):
    """Refresh (from scratch when *clean*) the pickled stdlib-name list."""
    pystdlibs = PythonStandardLibs()
    if clean:
        pystdlibs.clean()
    pystdlibs.get()

    # to show the thing works
    new_libs_obj = PythonStandardLibs()
    new_libs_obj.get()
    print("got these from pickled file: {}".format(new_libs_obj.libs))
def catfile(filename):
    """Return the text contents of *filename*."""
    with open(filename, 'r') as fhandle:
        print("Opening file {} and reading contents".format(filename))
        text = fhandle.read()
    return text


def get_dependencies(name, url):
    """
    Get the dependencies for a git repository or any local python package.

    Prefers requirements.txt, then setup.py, finally falling back to
    scanning the sources for import statements.  Standard-library
    modules are filtered out; returns a sorted pandas.Series of names.
    """
    # Let's instantiate the repo object, so we can parse through it.
    myrepo = repo.Repo(name, url)
    print("Created a repository instance for {}".format(url))

    # Extract a local copy
    myrepo.extract_local_copy()
    print("Local copy now available here: {}".format(myrepo.tmpdir))
    myrepo._get_filelist()

    # Note: the file has to be opened and read before passing to depsy
    # functions.
    if myrepo.has("requirements.txt"):
        print("Repository has a requirements.txt file")
        reqs = depsy.parse_requirements_txt(catfile(myrepo.has("requirements.txt")))
    elif myrepo.has("setup.py"):
        print("Repository has a setup.py file")
        reqs = depsy.parse_setup_py(catfile(myrepo.has("setup.py")))
        if len(reqs) < 1:
            # bug fix: the concatenated message was missing a space
            # ("file,so")
            print("No reqs in setup file, "
                  "so determining dependencies ourselves.")
            reqs = search_files_for_imports(myrepo)
    else:
        # No standard descriptions of the dependencies so let's try to
        # work them out for ourselves.
        print("No req or setup file, so determining dependencies ourselves.")
        reqs = search_files_for_imports(myrepo)

    # De-duplicate the requirements.
    reqs = set(reqs)
    print("Found the following imports: {}".format("\n".join(reqs)))

    # Get the list of standard packages so that these can be removed.
    stdlibs = depsy.PythonStandardLibs()
    stdlibs.get()
    set_std_libs = set(stdlibs.libs)

    data = pandas.Series(list(reqs - set_std_libs))
    data.sort_values(inplace=True)
    return data


def search_files_for_imports(repo_instance):
    """
    Walk all the python files in the repository and extract the import info.
    """
    dep_list = []
    for fname in repo_instance.file_list:
        # bug fix: `".py" in f` also matched .pyc/.pyo byte-code files
        # and names like `foo.py.orig`; only real sources are scanned
        if fname.endswith(".py"):
            print("Looking in {} for imports".format(os.path.basename(fname)))
            dep_list.extend(find_imports(catfile(fname)))

    return dep_list
def find_imports(text):
    """Find top-level module names imported by python source *text*.

    Names from plain ``import foo`` lines are returned first, then names
    from ``from foo import ...`` lines, each group in line order.
    Commented-out imports are ignored; indented imports are included.

    Bug fixes over the original character-class hack
    (``^[\\si]+mport`` / ``^[\\sf]+rom``): that pattern also matched
    garbage such as ``iimport x``, and its trailing ``[\\s\\.]`` required
    a character after the name, so an ``import os`` on the final,
    unterminated line of a file was silently missed.  The pointless
    try/except AttributeError around findall is gone too.
    """
    patterns = [
        re.compile(r'^\s*import\s+(\w+)', re.MULTILINE),
        re.compile(r'^\s*from\s+(\w+)', re.MULTILINE),
    ]
    import_list = []
    for pattern in patterns:
        import_list.extend(pattern.findall(text))
    return import_list
def extract_local_copy(self):
    """Make the repository contents available locally.

    A filesystem path is used in place; a remote ("http...") url is
    cloned into a temp dir that is registered for later cleanup().
    Sets self.tmpdir and self.extracted.

    Raises
    ------
    IOError
        If a local path does not exist, or a remote clone fails.
    NotImplementedError
        For repository types other than git.
    """
    if "http" not in self.url:
        if os.path.exists(self.url):
            print("Repository exists locally")
            self.tmpdir = self.url
            self.extracted = True
            return
        else:
            raise IOError("Path to repository doesn't exist")
    else:
        print("Extracting local copy of repository")
        self.tmpdir = tempfile.mkdtemp()
        # remember the temp dir so cleanup() can remove it later
        self.local_resources.append(self.tmpdir)
        print("Created temporary directory")
        # bug fix: `self.rtype is "git"` compared identity, not
        # equality -- it only worked by CPython string interning
        if self.rtype == "git":
            extract_cmd = "git clone {url} {odir}".format(url=self.url,
                                                          odir=self.tmpdir)
        else:
            # We could implement SVN here, a quick svn export would do.
            # bug fix: `raise NotImplemented` raises TypeError because
            # NotImplemented is not an exception class
            raise NotImplementedError(
                "unsupported repository type: {}".format(self.rtype))

        try:
            subprocess.check_call(extract_cmd.split())
        except subprocess.CalledProcessError:
            raise IOError("Unable to extract a local copy of repository")
        else:
            self.extracted = True
- + """ if not self.extracted: self.extract_local_copy() - + if not self.file_list: self._get_filelist() - + for f in self.file_list: if filename in f: return f - + return False - + def _get_filelist(self): """Just get a list of the files in the repo.""" - + if not self.extracted: - self.extract_local_copy() - + self.extract_local_copy() + for root, dirs, files in os.walk(self.tmpdir, topdown=True): for name in files: + print(os.path.join(root, name)) self.file_list.append(os.path.join(root, name)) - - + def cleanup(self): """Remove any local resources""" - + for resource in self.local_resources: try: shutil.rmtree(resource) except: - print "Unable to remove: {}".format(resource) - - - + print("Unable to remove: {}".format(resource)) diff --git a/grab_dependencies.py b/grab_dependencies.py deleted file mode 100644 index af5cc0f..0000000 --- a/grab_dependencies.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Extract the dependencies from the repository - -Issue: -work out dependencies #3 -https://github.com/lbillingham/commit_opener/issues/3 - -""" - -def catfile(filename): - """Get text contents of a file.""" - - with open(filename, 'r') as fhandle: - return "\n".join(fhandle.read()) - - -def get_dependencies(name, url): - - # Let's instantiate the repo object, so we can parse through it. - myrepo = repo.Repo(name, url) - - # Extract a local copy - myrepo.extract_local_copy() - - # Note: the file has to be opened and read before passing to depsy - # functions. - if myrepo.has("requirements.txt"): - filetext = catfile(myrepo.has("requirements.txt")) - reqs = depsy.models.python(filetext) - elif myrepo.has("setup.py"): - filetext = catfile(myrepo.has("setup.py")) - reqs = depsy.models.parse_setup_py(filetext) - else: - # No standard descriptions of the dependencies so let's try to work - # them out for ourselves. 
def test_import_search():
    """Plain imports are reported first, then `from` imports, in order."""
    source = (
        "\n"
        "import os\n"
        "import scipy\n"
        "import pandas\n"
        "from numpy import something\n"
        "import matplotlib.pyplot as plt\n"
    )
    # Ordering matters here. Normal imports done first, then froms.
    assert co_grab.find_imports(source) == [
        'os', 'scipy', 'pandas', 'matplotlib', 'numpy']


def test_commented():
    """A commented-out import must not be reported."""
    source = "\nimport os\n#import scipy\n"
    assert co_grab.find_imports(source) == ['os']


def test_indented():
    """Indented imports (e.g. inside a function) are still found."""
    source = "\nimport os\n    import scipy\n"
    assert co_grab.find_imports(source) == ['os', 'scipy']