From 2cd495fe5d4174761f0eda3d508c73fd9095e792 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sat, 4 Jun 2022 13:32:52 -0700 Subject: [PATCH 01/22] Add clone, referring, paths, and traffic sqlmodels --- github_stats_pages/models/__init__.py | 4 ++++ github_stats_pages/models/clone.py | 11 +++++++++++ github_stats_pages/models/paths.py | 12 ++++++++++++ github_stats_pages/models/referring.py | 12 ++++++++++++ github_stats_pages/models/traffic.py | 11 +++++++++++ setup.cfg | 1 + 6 files changed, 51 insertions(+) create mode 100644 github_stats_pages/models/__init__.py create mode 100644 github_stats_pages/models/clone.py create mode 100644 github_stats_pages/models/paths.py create mode 100644 github_stats_pages/models/referring.py create mode 100644 github_stats_pages/models/traffic.py diff --git a/github_stats_pages/models/__init__.py b/github_stats_pages/models/__init__.py new file mode 100644 index 0000000..054109e --- /dev/null +++ b/github_stats_pages/models/__init__.py @@ -0,0 +1,4 @@ +from .clone import Clone # noqa: F401 +from .referring import Referring # noqa: F401 +from .paths import Paths # noqa: F401 +from .traffic import Traffic # noqa: F401 diff --git a/github_stats_pages/models/clone.py b/github_stats_pages/models/clone.py new file mode 100644 index 0000000..f94d7d3 --- /dev/null +++ b/github_stats_pages/models/clone.py @@ -0,0 +1,11 @@ +from typing import Optional + +from sqlmodel import SQLModel, Field + + +class Clone(SQLModel, table=True): + id: Optional[int] = Field(default=None, primary_key=True) + repository_name: str + date: str + clones: int + unique_clones: int diff --git a/github_stats_pages/models/paths.py b/github_stats_pages/models/paths.py new file mode 100644 index 0000000..0184af4 --- /dev/null +++ b/github_stats_pages/models/paths.py @@ -0,0 +1,12 @@ +from typing import Optional + +from sqlmodel import SQLModel, Field + + +class Paths(SQLModel, table=True): + id: Optional[int] = Field(default=None, primary_key=True) + date: str + path: str + title: str + views: int + unique_views: int diff --git a/github_stats_pages/models/referring.py b/github_stats_pages/models/referring.py new file mode 100644 index 0000000..9d1081d --- /dev/null +++ b/github_stats_pages/models/referring.py @@ -0,0 +1,12 @@ +from typing import Optional + +from sqlmodel import SQLModel, Field + + +class Referring(SQLModel, table=True): + id: Optional[int] = Field(default=None, primary_key=True) + repository_name: str + site: str + date: str + views: int + unique_visitors: int diff --git a/github_stats_pages/models/traffic.py b/github_stats_pages/models/traffic.py new file mode 100644 index 0000000..7da603c --- /dev/null +++ b/github_stats_pages/models/traffic.py @@ -0,0 +1,11 @@ +from typing import Optional + +from sqlmodel import SQLModel, Field + + +class Traffic(SQLModel, table=True): + id: Optional[int] = Field(default=None, primary_key=True) + repository_name: str + date: str + views: int + unique_visitors: int diff --git a/setup.cfg b/setup.cfg index fc32e3e..1235676 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,6 +37,7 @@ install_requires = PyGithub == 1.55 tabulate == 0.8.7 rich >= 12.4.1, <13.0.0 + sqlmodel >= 0.0.6, < 1.0.0 scripts = scripts/get_repo_list scripts/gts_run_all_repos From a38823469cbbe22b0db504d9549c37e2461f3b54 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Fri, 24 Jun 2022 15:32:51 -0700 Subject: [PATCH 02/22] Add db module to support sqlite #89 - Update Paths model to have optional repository_name field - Create data directory for sqlite db if necessary --- github_stats_pages/db.py | 86 ++++++++++++++++++++++++++++++ github_stats_pages/models/paths.py | 1 + 2 files changed, 87 insertions(+) create mode 100644 github_stats_pages/db.py diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py new file mode 100644 index 0000000..706b4b2 --- /dev/null +++ b/github_stats_pages/db.py @@ -0,0 +1,86 @@ +from functools import partial +from pathlib import Path +from typing import Union, Type + +import pandas as pd +from sqlalchemy.future import Engine +from sqlalchemy.exc import NoResultFound +from sqlmodel import SQLModel, Session, create_engine, select + +from .models import Clone, Referring, Traffic, Paths +from .logger import app_log as log + +SQLITE_FILE_NAME = Path("data/sqlite3.db") + + +def configure(test: bool = False, echo: bool = False) -> Engine: + sqlite_file_name = ( + Path("tests_data/sqlite3.db") if test else SQLITE_FILE_NAME + ) + if not sqlite_file_name.parent.exists(): + sqlite_file_name.parent.mkdir() + sqlite_url = f"sqlite:///{sqlite_file_name}" + return create_engine(sqlite_url, echo=echo) + + +def create_db_and_tables(test: bool = False, echo: bool = False): + engine = configure(test=test, echo=echo) + SQLModel.metadata.create_all(engine) + return engine + + +def migrate_csv( + filename: Union[str, Path], + model: Type[SQLModel], + engine: Engine, + skip_rows: Union[int, None] = None, +): + """Migrate CSV over to SQLite""" + + names = list( + map( + lambda f: f.name, + filter(lambda x: x.required, model.__fields__.values()), + ) + ) + log.info(f"[yellow]Loading: {filename}") + df = pd.read_csv(filename, header=None, skiprows=skip_rows, names=names) + if isinstance(model, Paths): + repository_names = [a.split("/")[2] for a in df["path"].values] + df.insert(1, "repository_name", repository_names) + + func = partial(query, engine=engine, model=model) + + query_results = list(map(func, df["repository_name"], df["date"])) + new_df: pd.DataFrame = df.iloc[ + [idx for idx, item in enumerate(query_results) if not item] + ] + if new_df.empty: + log.info("No new records!") + else: + log.info(f"New records found: {len(new_df)}") + log.info("[bold yellow]Adding data") + new_df.to_sql( + model.__name__.lower(), engine, if_exists="append", index=False + ) + if len(new_df) < len(df): + log.info("[orange]Some records exists in db") + + +def query( + repository_name: str, + date: str, + engine: Engine, + model: Union[Type[SQLModel], Clone, Referring, Paths, Traffic], +) -> Union[SQLModel, Clone, Referring, Paths, Traffic, None]: + + with Session(engine) as session: + result = session.exec( + select(model).where( + model.repository_name == repository_name, model.date == date + ) + ) + try: + return result.one() + except NoResultFound: + return diff --git a/github_stats_pages/models/paths.py b/github_stats_pages/models/paths.py index 0184af4..6b2cb97 100644 --- a/github_stats_pages/models/paths.py +++ b/github_stats_pages/models/paths.py @@ -6,6 +6,7 @@ class Paths(SQLModel, table=True): id: Optional[int] = Field(default=None, primary_key=True) date: str + repository_name: Optional[str] path: str title: str views: int From bd0e949e401241875ebd0026919f1a4716ad6c9e Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Fri, 24 Jun 2022 15:37:19 -0700 Subject: [PATCH 03/22] Add migrate_to_sqlite script - Migrate referring data to include date stamp --- entrypoint.sh | 4 +++- scripts/migrate_to_sqlite | 47 +++++++++++++++++++++++++++++++++++++++ setup.cfg | 1 + 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100755 scripts/migrate_to_sqlite diff --git a/entrypoint.sh b/entrypoint.sh index c8d86eb..98d6049 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -21,6 +21,8 @@ else test="--test" fi +migrate_to_sqlite + get_repo_list -u $1 gts_run_all_repos -u $1 -t $2 -c "$1".csv ${test} @@ -29,5 +31,5 @@ if [ $ret -ne 0 ]; then exit 1 fi -merge-csv.sh . +# merge-csv.sh . make_stats_plots -u $1 -t $2 -c "$1".csv -o ./public ${include_repos} ${exclude_repos} diff --git a/scripts/migrate_to_sqlite b/scripts/migrate_to_sqlite new file mode 100755 index 0000000..0403a30 --- /dev/null +++ b/scripts/migrate_to_sqlite @@ -0,0 +1,47 @@ +#!/usr/bin/env python +from pathlib import Path + +import pandas as pd + +from github_stats_pages import db +from github_stats_pages.logger import app_log as log +from github_stats_pages.models import Clone, Traffic, Referring, Paths + +DROP_DUPLICATES_SUBSET = ["date", "repository_name", "site"] + + +if __name__ == "__main__": + log.info("[bold yellow]Running migrate_to_sqlite script") + + sql_path = Path(db.SQLITE_FILE_NAME) + if sql_path.exists(): + log.info("SQLite DB exists!") + engine = db.create_db_and_tables() + + p_data = Path("data") + + # Handle referrer files (missing date field) + referrer_files = list(p_data.glob("*referrer-stats.csv")) + log.info(f"Number of referrer files: {len(referrer_files)}") + referrer_merged_df = pd.DataFrame() + for r_file in referrer_files: + file_date = r_file.name.rstrip("data/")[:10] + r_df = pd.read_csv(r_file) + r_df.insert(loc=0, column="date", value=file_date) + referrer_merged_df = referrer_merged_df.append(r_df, ignore_index=True) + if not referrer_merged_df.empty: + referrer_merged_df.drop_duplicates( + subset=DROP_DUPLICATES_SUBSET, keep="last", inplace=True + ) + log.info(f"Referrer record number: {len(referrer_merged_df)}") + referrer_outfile = p_data / "merged_referrer.csv" + log.info(f"Writing: {referrer_outfile}") + referrer_merged_df.to_csv(referrer_outfile, header=False, index=False) + + merged_files = [x for x in sorted(p_data.glob("merged_*.csv"))] + if merged_files: + model_list = [Clone, Paths, Referring, Traffic] + for file, model in zip(merged_files, model_list): + db.migrate_csv(file, model=model, engine=engine) + else: + log.info("No merged files to migrate!") diff --git a/setup.cfg b/setup.cfg index 1235676..7b9bfc6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -43,6 +43,7 @@ scripts = scripts/gts_run_all_repos scripts/merge-csv.sh scripts/make_stats_plots + scripts/migrate_to_sqlite #package_dir= # =github_stats_pages packages = From 5c001acaca295abb60166930473bcde10a6cf28a Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 26 Jun 2022 09:57:29 -0700 Subject: [PATCH 04/22] stats_plots: Load data from sqlite - Add query_all() in db module - Ensure that dict_df has all tables even if empty - Remove data-dir option in make_stats_plots --- github_stats_pages/db.py | 14 +++++++++++++- github_stats_pages/stats_plots.py | 22 +++++++++++++--------- scripts/make_stats_plots | 7 ------- tests/test_stats_plots.py | 2 +- 4 files changed, 27 insertions(+), 18 deletions(-) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 706b4b2..377aeb9 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -1,6 +1,6 @@ from functools import partial from pathlib import Path -from typing import Union, Type +from typing import List, Type, Union import pandas as pd from sqlalchemy.future import Engine @@ -20,6 +20,7 @@ def configure(test: bool = False, echo: bool = False) -> Engine: if not sqlite_file_name.parent.exists(): sqlite_file_name.parent.mkdir() sqlite_url = f"sqlite:///{sqlite_file_name}" + log.info(f"Configuring SQLite at: {sqlite_url}") return create_engine(sqlite_url, echo=echo) @@ -84,3 +85,14 @@ def query( return result.one() except NoResultFound: return + + +def query_all( + engine: Engine, + model: Union[Type[SQLModel], Clone, Referring, Paths, Traffic], +) -> List[Union[SQLModel, Clone, Referring, Paths, Traffic]]: + """Retrieve an entire table""" + + with Session(engine) as session: + result = session.exec(select(model)) + return result.all() diff --git a/github_stats_pages/stats_plots.py b/github_stats_pages/stats_plots.py index 934c31e..6476854 100644 --- a/github_stats_pages/stats_plots.py +++ b/github_stats_pages/stats_plots.py @@ -15,6 +15,8 @@ import pandas as pd from .logger import app_log as log +from . import db +from .models import Clone, Traffic prefix = "merged" stats_type = ["traffic", "clone"] @@ -29,23 +31,25 @@ main_p = Path(__file__).parent -def load_data(data_dir: str) -> Dict[str, pd.DataFrame]: +def load_data(test: bool = False) -> Dict[str, pd.DataFrame]: """ Load stats CSV as dict of pandas DataFrame - :param data_dir: Path containing merged*.csv :return: Dict of pandas DataFrame """ - p = Path(data_dir) / "data" + engine = db.create_db_and_tables(test=test) dict_df = {} - for stats in stats_type: - stat_file = p / f"{prefix}_{stats}.csv" - names = r_columns if stats == "referrer" else columns - dict_df[stats] = pd.read_csv(stat_file, header=None, names=names) - + for stats, m in zip(stats_type, [Traffic, Clone]): + records = [i.dict() for i in db.query_all(engine, m)] + if records: + dict_df[stats] = pd.DataFrame.from_records(records, index="id") + else: + log.warning(f"[bold red]No data in {stats} table!") + names = r_columns if stats == "referrer" else columns + dict_df[stats] = pd.DataFrame(columns=names) return dict_df @@ -247,7 +251,7 @@ def make_plots( (~repository_df["fork"]) & (~repository_df["archived"]) ] - dict_df = load_data(data_dir) + dict_df = load_data() # Add repo folder for all static repo pages p_repos = Path(out_dir) / "repos" diff --git a/scripts/make_stats_plots b/scripts/make_stats_plots index 29084af..81f8b4d 100755 --- a/scripts/make_stats_plots +++ b/scripts/make_stats_plots @@ -17,13 +17,6 @@ if __name__ == "__main__": parser.add_argument( "-t", "--token", default="", help="GitHub API token" ) # Avoids rate limiting - parser.add_argument( - "-d", - "--data-dir", - default=Path.cwd(), - help="""Folder path containing merge CSV files - 'Default: current working directory""", - ) parser.add_argument( "-o", "--out-dir", diff --git a/tests/test_stats_plots.py b/tests/test_stats_plots.py index 29138df..0bf8f84 100644 --- a/tests/test_stats_plots.py +++ b/tests/test_stats_plots.py @@ -10,7 +10,7 @@ def test_load_data(): - dict_df = stats_plots.load_data(tests_data_folder) + dict_df = stats_plots.load_data(test=True) assert isinstance(dict_df, dict) From ee9f7f68f57ec67ee835673c6fd884c8e0735ca6 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 26 Jun 2022 10:14:33 -0700 Subject: [PATCH 05/22] gts_run_all_repos: Migrate new CSV to sqlite --- scripts/gts_run_all_repos | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/gts_run_all_repos b/scripts/gts_run_all_repos index 5f2c3c5..5a9be20 100755 --- a/scripts/gts_run_all_repos +++ b/scripts/gts_run_all_repos @@ -6,6 +6,7 @@ import pandas as pd from github_stats_pages import gts_run from github_stats_pages.logger import app_log as log +from github_stats_pages import db def read_csv(csv_file: str) -> pd.DataFrame: @@ -52,14 +53,14 @@ if __name__ == "__main__": gts_run.run_each_repo(args.user, args.token, repo_name, save_csv=True) gts_run.get_top_paths(args.user, args.token, repo_name, save_csv=True) - # Save files in a data folder - log.info("[yellow]Moving records to data/folder") + log.info("[yellow]Moving CSV records to SQLite") + engine = db.create_db_and_tables() + p_cwd = Path.cwd() p_data = p_cwd / "data" - if not p_data.exists(): - p_data.mkdir() - for f in p_cwd.glob("????-??-??-???-???-*stats.csv"): - f.rename(p_data / f.name) + for datatype, model in zip(["clone", "traffic"], [db.Clone, db.Traffic]): + for f in p_cwd.glob(f"????-??-??-???-???-*{datatype}-stats.csv"): + db.migrate_csv(f, model, engine, skip_rows=1) log.info("[dark_green]gts_run_all_repos script completed!") From cab1d1a7d971aa13097e532d4e0651c9fcb18f14 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 3 Jul 2022 13:34:35 -0700 Subject: [PATCH 06/22] Adjust model fields to be consistent with GitHub UI terms - Add column names in stats_plots - Add sqlite3 tests_data for some data - Update unit tests --- github_stats_pages/models/clone.py | 4 ++-- github_stats_pages/models/paths.py | 2 +- github_stats_pages/models/referring.py | 4 ++-- github_stats_pages/models/traffic.py | 2 +- github_stats_pages/stats_plots.py | 29 +++++++++++++++---------- tests/test_stats_plots.py | 2 +- tests_data/sqlite3.db | Bin 0 -> 24576 bytes 7 files changed, 25 insertions(+), 18 deletions(-) create mode 100644 tests_data/sqlite3.db diff --git a/github_stats_pages/models/clone.py b/github_stats_pages/models/clone.py index f94d7d3..be1a9e5 100644 --- a/github_stats_pages/models/clone.py +++ b/github_stats_pages/models/clone.py @@ -7,5 +7,5 @@ class Clone(SQLModel, table=True): id: Optional[int] = Field(default=None, primary_key=True) repository_name: str date: str - clones: int - unique_clones: int + total: int + unique: int diff --git a/github_stats_pages/models/paths.py b/github_stats_pages/models/paths.py index 6b2cb97..25cf34e 100644 --- a/github_stats_pages/models/paths.py +++ b/github_stats_pages/models/paths.py @@ -10,4 +10,4 @@ class Paths(SQLModel, table=True): path: str title: str views: int - unique_views: int + unique: int diff --git a/github_stats_pages/models/referring.py b/github_stats_pages/models/referring.py index 9d1081d..750a280 100644 --- a/github_stats_pages/models/referring.py +++ b/github_stats_pages/models/referring.py @@ -8,5 +8,5 @@ class Referring(SQLModel, table=True): repository_name: str site: str date: str - views: int - unique_visitors: int + total: int + unique: int diff --git a/github_stats_pages/models/traffic.py b/github_stats_pages/models/traffic.py index 7da603c..1842f67 100644 --- a/github_stats_pages/models/traffic.py +++ b/github_stats_pages/models/traffic.py @@ -8,4 +8,4 @@ class Traffic(SQLModel, table=True): repository_name: str date: str views: int - unique_visitors: int + unique: int diff --git a/github_stats_pages/stats_plots.py b/github_stats_pages/stats_plots.py index 6476854..5c344ad 100644 --- a/github_stats_pages/stats_plots.py +++ b/github_stats_pages/stats_plots.py @@ -20,8 +20,9 @@ prefix = "merged" stats_type = ["traffic", "clone"] -columns = ["repository_name", "date", "total", "unique"] -r_columns = ["repository_name", "source", "total", "unique"] # For referrer +c_columns = ["repository_name", "date", "total", "unique"] +r_columns = ["repository_name", "date", "source", "total", "unique"] +t_columns = ["repository_name", "date", "views", "unique"] TOOLTIPS = [ ("index", "$index"), @@ -48,7 +49,13 @@ def load_data(test: bool = False) -> Dict[str, pd.DataFrame]: dict_df[stats] = pd.DataFrame.from_records(records, index="id") else: log.warning(f"[bold red]No data in {stats} table!") - names = r_columns if stats == "referrer" else columns + names = [] + if stats == "clone": + names = c_columns + elif stats == "traffic": + names = t_columns + elif stats == "referrer": + names = r_columns dict_df[stats] = pd.DataFrame(columns=names) return dict_df @@ -218,19 +225,18 @@ def user_readme(username: str, token: str = None) -> str: def make_plots( username: str, - data_dir: str, out_dir: str, csv_file: str, symlink: bool = False, token: str = "", include_repos: str = "", exclude_repos: str = "", + test: bool = False, ): """ Generate HTML pages containing Bokeh plots :param username: GitHub username or organization - :param data_dir: Path to working folder. CSVs are under a 'data' sub-folder :param out_dir: Location of outputted HTML :param csv_file: CSV file containing user or organization repository list :param symlink: Symbolic link styles assets instead of copy. Default: copy @@ -239,6 +245,7 @@ def make_plots( Ignore csv_file inputs. Comma separated for multiples :param exclude_repos: Repositories to exclude from csv_file list. Comma separated for more than one + :param test: For CI testing """ if include_repos and exclude_repos: @@ -251,7 +258,7 @@ def make_plots( (~repository_df["fork"]) & (~repository_df["archived"]) ] - dict_df = load_data() + dict_df = load_data(test=test) # Add repo folder for all static repo pages p_repos = Path(out_dir) / "repos" @@ -261,7 +268,7 @@ def make_plots( # Get unique repository names repo_names0 = set() for key, df in dict_df.items(): - repo_names0.update(set(df[columns[0]].unique())) + repo_names0.update(set(df["repository_name"].unique())) repo_names = set(repository_df["name"]) & repo_names0 @@ -334,8 +341,8 @@ def make_plots( f"If you renamed it, you will need to update data/ contents" ) else: - r_traffic_df = traffic_df.loc[traffic_df[columns[0]] == r] - r_clone_df = clone_df.loc[clone_df[columns[0]] == r] + r_traffic_df = traffic_df.loc[traffic_df["repository_name"] == r] + r_clone_df = clone_df.loc[clone_df["repository_name"] == r] date_range = get_date_range([r_traffic_df, r_clone_df]) @@ -344,7 +351,7 @@ def make_plots( # Plot traffic data s1a = date_subplots( r_traffic_df, - "total", + "views", date_range, "Total Daily Traffic", **subplots_dict, @@ -395,7 +402,7 @@ def make_plots( jinja_dict = { "username": username, "title": f"GitHub Statistics for {r}", - "Total_Views": r_traffic_df["total"].sum(), + "Total_Views": r_traffic_df["views"].sum(), "Total_Clones": r_clone_df["total"].sum(), "script": script, "div": div, diff --git a/tests/test_stats_plots.py b/tests/test_stats_plots.py index 0bf8f84..15e8aa0 100644 --- a/tests/test_stats_plots.py +++ b/tests/test_stats_plots.py @@ -33,9 +33,9 @@ def html_check(input_list: list, exists=True): d0 = { "username": username, "token": token, - "data_dir": tests_data_folder, "out_dir": tests_data_folder, "csv_file": tests_data_folder / "repository.csv", + "test": True, } html_list = [ "index.html", diff --git a/tests_data/sqlite3.db b/tests_data/sqlite3.db new file mode 100644 index 0000000000000000000000000000000000000000..675edc53a3b72b230f0529e22be858eef6f1240f GIT binary patch literal 24576 zcmeI(Pfyce90%}r?HJvp^e7GkoHRk#37?m)<3g5{0nS z&<_C!KmY;|fB*y_0D%iCaIhAQCD+&4mr+OGY8dujgZw?cWwrNh!>U+DOK%#LYP->C zwkj=SzjF4XvBi>BC~IAtTtAs)4>kAHUESHS-KSo^TrS)zYI4eLOUcu5VbiQ_3M;E+ zZAmN5lD9S8F*b##h0@}3p(L+9EX%7;ibdLE8M|%UblTR73LR&^i1yLz^&1Axa!jXj zQajM~+%)#>{x2Ws>a@(g4mqtiu+x2yN-GZvr49MMwlPfyKNFwnF7dJC==_T(q(&T1MKF%%2g+J2CLjt6R+KlXQlN?{|WU%Cvl$Rtmvh!3FdD^Uz zB;>aJ;m+TkKjx1gNa0T3a0cJ;`GlJ#1EiN4naQ2Fkv;xty|5Mkg5kv7dL)*dnqm(p z-NcqrH!RC+ZTDYr1Ihc%!|8oO4onBRe@D-tFP+nN^v1~9`^%~A7FhSAd7!&NtBrQc zaGxRr0o|s53wQn*CdmDn5UY&zP5LOkk)BDKG$sBNKa20hSELyS1Rwwb2tWV=5P$## zAOHafK;Wzbsl*tQy(S3TrnA%8O52X^*y&w;+pu$)TsED_r!(_voYt^xK+Rl~Q-#o~ z8{?ch9$J-8aB3{HD$8-|Sa4M?ox6D@qDDij7Gz%KL#yWbh{}alWv@n+h+E}rO2(Dp zQYiugg zBcOQu0DRZA)4c+m;%x&6Dw>;Byj=jkMRetUMDaEO1Qjh@SG+v{K}B<%;%xy4D#|7l zZwEk7QASX_4FCZ}*;`!7UH>1rv_G%7%m3M+3*~sM|39A_8CQh>1Rwwb2tWV=5P$## LAOHafTqJ?taDGM^ literal 0 HcmV?d00001 From 53f3e65ebba227d105cc48b93b7fd2550f5c7a64 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 3 Jul 2022 14:56:52 -0700 Subject: [PATCH 07/22] Change to editable pip install in python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f282aa9..93a6fbe 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -24,7 +24,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install github_stats_pages run: | - pip install .[test] + pip install -e .[test] - name: Test with pytest run: | echo "Username for unit tests : ${{ github.actor }}" From 312c01738bda924d2f0d6d5c063276dbdab10dfd Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 4 Jul 2022 14:59:45 -0700 Subject: [PATCH 08/22] Add unit tests for db module - Fix typo with handling Paths data - Additional pragma no cover - Add test_engine fixture - Use test_engine fixture in stats_plot.load_data --- conftest.py | 7 +++ github_stats_pages/db.py | 6 +-- github_stats_pages/stats_plots.py | 11 +++-- tests/test_db.py | 33 +++++++++++++ tests/test_stats_plots.py | 4 +- tests_data/data/merged_paths.csv | 78 +++++++++++++++++++++++++++++++ 6 files changed, 130 insertions(+), 9 deletions(-) create mode 100644 tests/test_db.py create mode 100644 tests_data/data/merged_paths.csv diff --git a/conftest.py b/conftest.py index 2052cdc..3bd7a13 100644 --- a/conftest.py +++ b/conftest.py @@ -1,5 +1,7 @@ import pytest +from github_stats_pages import db + def pytest_addoption(parser): parser.addoption("--username", action="store", default="GitHub username") @@ -22,3 +24,8 @@ def token(request): if name_value is None: pytest.skip() return name_value + + +@pytest.fixture(scope="session") +def test_engine(): + return db.create_db_and_tables(test=True) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 377aeb9..7f9331c 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -17,7 +17,7 @@ def configure(test: bool = False, echo: bool = False) -> Engine: sqlite_file_name = ( Path("tests_data/sqlite3.db") if test else SQLITE_FILE_NAME ) - if not sqlite_file_name.parent.exists(): + if not sqlite_file_name.parent.exists(): # pragma: no cover sqlite_file_name.parent.mkdir() sqlite_url = f"sqlite:///{sqlite_file_name}" log.info(f"Configuring SQLite at: {sqlite_url}") @@ -46,7 +46,7 @@ def migrate_csv( ) log.info(f"[yellow]Loading: {filename}") df = pd.read_csv(filename, header=None, skiprows=skip_rows, names=names) - if isinstance(model, Paths): + if model.__name__ == "Paths": repository_names = [a.split("/")[2] for a in df["path"].values] df.insert(1, "repository_name", repository_names) @@ -64,7 +64,7 @@ def migrate_csv( new_df.to_sql( model.__name__.lower(), engine, if_exists="append", index=False ) - if len(new_df) < len(df): + if len(new_df) < len(df): # pragma: no cover log.info("[orange]Some records exists in db") diff --git a/github_stats_pages/stats_plots.py b/github_stats_pages/stats_plots.py index 5c344ad..7a97e5d 100644 --- a/github_stats_pages/stats_plots.py +++ b/github_stats_pages/stats_plots.py @@ -32,14 +32,17 @@ main_p = Path(__file__).parent -def load_data(test: bool = False) -> Dict[str, pd.DataFrame]: +def load_data( + test: bool = False, engine: Optional[db.Engine] = None +) -> Dict[str, pd.DataFrame]: """ Load stats CSV as dict of pandas DataFrame :return: Dict of pandas DataFrame """ - engine = db.create_db_and_tables(test=test) + if not engine: + engine = db.create_db_and_tables(test=test) dict_df = {} @@ -78,8 +81,8 @@ def get_date_range(df_list: List[pd.DataFrame]) -> Optional[Tuple[dt, dt]]: if len(x_min) > 0: return min(x_min) - td(days=1), max(x_max) + td(days=1) - else: - return None + else: # pragma: no cover + return def date_subplots( diff --git a/tests/test_db.py b/tests/test_db.py new file mode 100644 index 0000000..6e8c621 --- /dev/null +++ b/tests/test_db.py @@ -0,0 +1,33 @@ +from github_stats_pages import db + + +def test_migrate_csv(test_engine): + # This CSV is already present so no new records exist + db.migrate_csv("tests_data/data/merged_clone.csv", db.Clone, test_engine) + + # This will add new records and ensure Paths testing + db.migrate_csv("tests_data/data/merged_paths.csv", db.Paths, test_engine) + + +def test_query(test_engine): + + t_query = db.query( + "github-stats-pages", "2021-02-28", test_engine, db.Clone + ) + assert isinstance(t_query, db.Clone) + + t_query = db.query( + "github-stats-pages", "2021-02-28", test_engine, db.Traffic + ) + assert isinstance(t_query, db.Traffic) + + # This returns a None result + assert not db.query( + "github-stats-pages", "2020-01-01", test_engine, db.Clone + ) + + +def test_query_all(test_engine): + + t_query = db.query_all(test_engine, db.Clone) + assert isinstance(t_query, list) diff --git a/tests/test_stats_plots.py b/tests/test_stats_plots.py index 15e8aa0..e2a56a0 100644 --- a/tests/test_stats_plots.py +++ b/tests/test_stats_plots.py @@ -9,8 +9,8 @@ tests_data_folder = Path("tests_data") -def test_load_data(): - dict_df = stats_plots.load_data(test=True) +def test_load_data(test_engine): + dict_df = stats_plots.load_data(test=True, engine=test_engine) assert isinstance(dict_df, dict) diff --git a/tests_data/data/merged_paths.csv b/tests_data/data/merged_paths.csv new file mode 100644 index 0000000..23716a0 --- /dev/null +++ b/tests_data/data/merged_paths.csv @@ -0,0 +1,78 @@ +2021-05-13,/astrochun/Evolution-of-Galaxies,astrochun/Evolution-of-Galaxies: Research with Dr. Chun Ly. I am working with...,23,3 +2021-05-13,/astrochun/Evolution-of-Galaxies/blob/master/Analysis/emission_line_fit.py,Evolution-of-Galaxies/emission_line_fit.py at master · astrochun/Evolution-of...,4,2 +2021-05-13,/astrochun/Evolution-of-Galaxies/issues,Issues · astrochun/Evolution-of-Galaxies,10,3 +2021-05-13,/astrochun/Evolution-of-Galaxies/issues/55,Bug with saving fitting results · Issue #55 · astrochun/Evolution-of-Galaxies,3,2 +2021-05-13,/astrochun/Evolution-of-Galaxies/pull/56,hotfix/v0.8.2 by astrochun · Pull Request #56 · astrochun/Evolution-of-Galaxies,6,2 +2021-05-13,/astrochun/Evolution-of-Galaxies/pull/56/files,hotfix/v0.8.2 by astrochun · Pull Request #56 · astrochun/Evolution-of-Galaxies,4,2 +2021-05-13,/astrochun/Evolution-of-Galaxies/pull/57,Feature/53 windows uname by astrochun · Pull Request #57 · astrochun/Evolutio...,3,2 +2021-05-13,/astrochun/Evolution-of-Galaxies/pulls,Pull requests · astrochun/Evolution-of-Galaxies,19,3 +2021-05-13,/astrochun/Evolution-of-Galaxies/tree/develop,astrochun/Evolution-of-Galaxies at develop,4,2 +2021-05-13,/astrochun/Evolution-of-Galaxies/tree/master/Analysis,Evolution-of-Galaxies/Analysis at master · astrochun/Evolution-of-Galaxies,5,3 +2021-05-13,/astrochun/GNIRSLongSlit,GitHub - astrochun/GNIRSLongSlit: Python 2.7 codes to reduce Longslit data fr...,2,2 +2021-05-13,/astrochun/MMTtools,GitHub - astrochun/MMTtools: A set of Python 2.7 and 3.x codes to use with da...,2,2 +2021-05-13,/astrochun/Metallicity_Stack_Commons,astrochun/Metallicity_Stack_Commons: Set of common codes used in metallicity ...,19,3 +2021-05-13,/astrochun/Metallicity_Stack_Commons/blob/master/Metallicity_Stack_Commons/__init__.py,Metallicity_Stack_Commons/__init__.py at master · astrochun/Metallicity_Stack...,5,2 +2021-05-13,/astrochun/Metallicity_Stack_Commons/issues,Issues · astrochun/Metallicity_Stack_Commons,18,3 +2021-05-13,/astrochun/Metallicity_Stack_Commons/issues/114,Bug: Change HbHgHd_fit to have same y-axis scale · Issue #114 · astrochun/Met...,4,2 +2021-05-13,/astrochun/Metallicity_Stack_Commons/pull/115,hotfix/1.4.6 by astrochun · Pull Request #115 · astrochun/Metallicity_Stack_C...,4,2 +2021-05-13,/astrochun/Metallicity_Stack_Commons/pull/117,hotfix/v1.4.7 by Reagen · Pull Request #117 · astrochun/Metallicity_Stack_Com...,4,2 +2021-05-13,/astrochun/Metallicity_Stack_Commons/pull/118,hotfix/1.4.7 by Reagen · Pull Request #118 · astrochun/Metallicity_Stack_Comm...,11,2 +2021-05-13,/astrochun/Metallicity_Stack_Commons/pulls,Pull requests · astrochun/Metallicity_Stack_Commons,28,3 +2021-05-13,/astrochun/Metallicity_Stack_Commons/tree/master/Metallicity_Stack_Commons,Metallicity_Stack_Commons/Metallicity_Stack_Commons at master · astrochun/Met...,9,3 +2021-05-13,/astrochun/Metallicity_Stack_Commons/tree/master/Metallicity_Stack_Commons/analysis,Metallicity_Stack_Commons/Metallicity_Stack_Commons/analysis at master · astr...,4,2 +2021-05-13,/astrochun/PyMontage,GitHub - astrochun/PyMontage: Python 2.7 scripts to running the IPAC Montage ...,1,1 +2021-05-13,/astrochun/Zcalbase_gal,astrochun/Zcalbase_gal: Python 3.x codes for Metallicity Calibration Database...,18,2 +2021-05-13,/astrochun/Zcalbase_gal/blob/master/analysis/deep2_r23_o32/zoom_and_gauss_general.py,Zcalbase_gal/zoom_and_gauss_general.py at master · astrochun/Zcalbase_gal,3,1 +2021-05-13,/astrochun/Zcalbase_gal/issues,Issues · astrochun/Zcalbase_gal,13,2 +2021-05-13,/astrochun/Zcalbase_gal/issues/100,Bug: Incorrect use of normalization for fits · Issue #100 · astrochun/Zcalbas...,5,2 +2021-05-13,/astrochun/Zcalbase_gal/pull/99,hotfix/v0.13.1 by Reagen · Pull Request #99 · astrochun/Zcalbase_gal,14,2 +2021-05-13,/astrochun/Zcalbase_gal/pull/99/commits,hotfix/v0.13.1 by Reagen · Pull Request #99 · astrochun/Zcalbase_gal,4,2 +2021-05-13,/astrochun/Zcalbase_gal/pull/99/files,hotfix/v0.13.1 by Reagen · Pull Request #99 · astrochun/Zcalbase_gal,9,1 +2021-05-13,/astrochun/Zcalbase_gal/pulls,Pull requests · astrochun/Zcalbase_gal,21,2 +2021-05-13,/astrochun/Zcalbase_gal/tree/master/analysis,Zcalbase_gal/analysis at master · astrochun/Zcalbase_gal,4,1 +2021-05-13,/astrochun/Zcalbase_gal/tree/master/analysis/deep2_r23_o32,Zcalbase_gal/analysis/deep2_r23_o32 at master · astrochun/Zcalbase_gal,4,1 +2021-05-13,/astrochun/academic-ads-bibtex/blob/main/setup.py,academic-ads-bibtex/setup.py at main · astrochun/academic-ads-bibtex,1,1 +2021-05-13,/astrochun/figshare_autosync_check,astrochun/figshare_autosync_check,6,1 +2021-05-13,/astrochun/figshare_autosync_check/actions,Actions · astrochun/figshare_autosync_check,3,1 +2021-05-13,/astrochun/figshare_autosync_check/actions/runs/415969321,Add data.csv 2020-08-15 data · astrochun/figshare_autosync_check@1ecfbfa,2,1 +2021-05-13,/astrochun/figshare_autosync_check/actions/workflows/create_release.yml,Actions · astrochun/figshare_autosync_check,3,1 +2021-05-13,/astrochun/figshare_autosync_check/blob/main/.github/workflows/create_release.yml,figshare_autosync_check/create_release.yml at main · astrochun/figshare_autos...,4,1 +2021-05-13,/astrochun/figshare_autosync_check/commit/9829445961cfeef4b203925a3d5dcb073341863d,Add .git dot files · astrochun/figshare_autosync_check@9829445,1,1 +2021-05-13,/astrochun/figshare_autosync_check/commits/main,Commits · astrochun/figshare_autosync_check,1,1 +2021-05-13,/astrochun/figshare_autosync_check/tree/main/.github/workflows,figshare_autosync_check/.github/workflows at main · astrochun/figshare_autosy...,5,1 +2021-05-13,/astrochun/github-stats,astrochun/github-stats: My GitHub stats,14,4 +2021-05-13,/astrochun/github-stats-pages,astrochun/github-stats-pages: Retrieve statistics for a user's repositories a...,37,2 +2021-05-13,/astrochun/github-stats-pages/actions,Actions · astrochun/github-stats-pages,17,1 +2021-05-13,/astrochun/github-stats-pages/graphs/traffic,Traffic · astrochun/github-stats-pages,5,1 +2021-05-13,/astrochun/github-stats-pages/issues,Issues · astrochun/github-stats-pages,24,2 +2021-05-13,/astrochun/github-stats-pages/issues/52,Add popular content · Issue #52 · astrochun/github-stats-pages,10,2 +2021-05-13,/astrochun/github-stats-pages/pull/55,Feature: Add GitHub repo description on repo pages by astrochun · Pull Reques...,8,1 +2021-05-13,/astrochun/github-stats-pages/pull/56,Switch over to use PyGitHub for more capabilities by astrochun · Pull Request...,8,1 +2021-05-13,/astrochun/github-stats-pages/pulls,Pull requests · astrochun/github-stats-pages,20,1 +2021-05-13,/astrochun/github-stats-pages/pulse,Pulse · astrochun/github-stats-pages,6,2 +2021-05-13,/astrochun/github-stats-pages/releases,Releases · astrochun/github-stats-pages,26,2 +2021-05-13,/astrochun/github-stats/blob/gh-pages/repos/Extract1D.html,github-stats/Extract1D.html at gh-pages · astrochun/github-stats,1,1 +2021-05-13,/astrochun/github-stats/blob/gh-pages/repositories.html,github-stats/repositories.html at gh-pages · astrochun/github-stats,1,1 +2021-05-13,/astrochun/github-stats/blob/main/.github/workflows/gh-pages-deploy.yml,github-stats/gh-pages-deploy.yml at main · astrochun/github-stats,1,1 +2021-05-13,/astrochun/github-stats/network,Network Graph · astrochun/github-stats,1,1 +2021-05-13,/astrochun/github-stats/network/dependencies,Dependencies · astrochun/github-stats,1,1 +2021-05-13,/astrochun/github-stats/tree/gh-pages,astrochun/github-stats at gh-pages,4,2 +2021-05-13,/astrochun/github-stats/tree/gh-pages/repos,github-stats/repos at gh-pages · astrochun/github-stats,2,1 +2021-05-13,/astrochun/github-stats/tree/gh-pages/styles,github-stats/styles at gh-pages · astrochun/github-stats,2,1 +2021-05-13,/astrochun/github-stats/tree/main/.github/workflows,github-stats/.github/workflows at main · astrochun/github-stats,3,1 +2021-05-13,/astrochun/site-hugo-academic,astrochun/site-hugo-academic,2,2 +2021-05-13,/astrochun/test-github-stats,astrochun/test-github-stats: Testing of GitHub action for GitHub pages deploy...,2,1 +2021-05-13,/astrochun/test-github-stats/blob/main/data/2021-01-17-00h-46m-clone-stats.csv,test-github-stats/2021-01-17-00h-46m-clone-stats.csv at main · astrochun/test...,1,1 +2021-05-13,/astrochun/test-github-stats/blob/main/data/2021-01-17-00h-46m-referrer-stats.csv,test-github-stats/2021-01-17-00h-46m-referrer-stats.csv at main · astrochun/t...,3,1 +2021-05-13,/astrochun/test-github-stats/blob/main/data/2021-01-17-00h-46m-traffic-stats.csv,test-github-stats/2021-01-17-00h-46m-traffic-stats.csv at main · astrochun/te...,2,1 +2021-05-13,/astrochun/test-github-stats/blob/main/data/2021-01-17-00h-52m-clone-stats.csv,test-github-stats/2021-01-17-00h-52m-clone-stats.csv at main · astrochun/test...,1,1 +2021-05-13,/astrochun/test-github-stats/blob/main/data/2021-02-20-08h-55m-referrer-stats.csv,test-github-stats/2021-02-20-08h-55m-referrer-stats.csv at main · astrochun/t...,1,1 +2021-05-13,/astrochun/test-github-stats/blob/main/data/2021-02-26-22h-11m-traffic-stats.csv,test-github-stats/2021-02-26-22h-11m-traffic-stats.csv at main · astrochun/te...,1,1 +2021-05-13,/astrochun/test-github-stats/commit/27a681b3ce456710f7db46606566f000e144d6f4,Update data: 2021-05-11 · astrochun/test-github-stats@27a681b,1,1 +2021-05-13,/astrochun/test-github-stats/commit/de2412de46852a7acdfad4db7a10bbd956ef8448,Update data: 2021-05-12 · astrochun/test-github-stats@de2412d,2,1 +2021-05-13,/astrochun/test-github-stats/tree/main/data,test-github-stats/data at main · astrochun/test-github-stats,3,1 +2021-05-13,/astrochun/test-stats,astrochun/test-stats,2,1 +2021-05-13,/astrochun/test-stats/actions,Actions · astrochun/test-stats,1,1 +2021-05-13,/astrochun/test-stats/tree/gh-pages,astrochun/test-stats at gh-pages,1,1 +2021-05-13,/astrochun/test-stats/tree/gh-pages/repos,test-stats/repos at gh-pages · astrochun/test-stats,1,1 +2021-05-13,/astrochun/voxcharta-my-voting-record,GitHub - astrochun/voxcharta-my-voting-record: A Python tool to extract infor...,1,1 From f57a3512b6c0839d178a863fc559fcfc57ef1901 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 11 Jul 2022 20:20:21 -0700 Subject: [PATCH 09/22] db: Add query_path function to handle Paths data query --- github_stats_pages/db.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 7f9331c..8e9f4ae 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -50,9 +50,15 @@ def migrate_csv( repository_names = [a.split("/")[2] for a in df["path"].values] df.insert(1, "repository_name", repository_names) - func = partial(query, engine=engine, model=model) + if model.__name__ == "Paths": + func = partial(query_path, engine=engine, model=model) + query_results = list( + map(func, df["repository_name"], df["date"], df["path"]) + ) + else: + func = partial(query, engine=engine, model=model) + query_results = list(map(func, df["repository_name"], df["date"])) - query_results = list(map(func, df["repository_name"], df["date"])) new_df: pd.DataFrame = df.iloc[ [idx for idx, item in enumerate(query_results) if not item] ] @@ -96,3 +102,25 @@ def query_all( with Session(engine) as session: result = session.exec(select(model)) return result.all() + + +def query_path( + repository_name: str, + date: str, + path: str, + engine: Engine, + model: Union[Type[SQLModel], Paths], +) -> Union[SQLModel, Paths, None]: + + with Session(engine) as session: + result = session.exec( + select(model).where( + model.repository_name == repository_name, + model.date == date, + model.path == path, + ) + ) + try: + return result.one() + except NoResultFound: + return From f5074171b88a03821d0ef28d1706a9177368a3cd Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 12 Jul 2022 18:41:07 -0700 Subject: [PATCH 10/22] Add additional verbose messaging in scripts --- github_stats_pages/db.py | 1 + scripts/gts_run_all_repos | 4 ++-- scripts/make_stats_plots | 4 +++- scripts/migrate_to_sqlite | 2 ++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 8e9f4ae..a4e2692 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -46,6 +46,7 @@ def migrate_csv( ) log.info(f"[yellow]Loading: {filename}") df = pd.read_csv(filename, header=None, skiprows=skip_rows, names=names) + log.info(f"Size of dataframe: {len(df)}") if model.__name__ == "Paths": repository_names = [a.split("/")[2] for a in df["path"].values] df.insert(1, "repository_name", repository_names) diff --git a/scripts/gts_run_all_repos b/scripts/gts_run_all_repos index 5a9be20..be5e5fb 100755 --- a/scripts/gts_run_all_repos +++ b/scripts/gts_run_all_repos @@ -29,7 +29,7 @@ if __name__ == "__main__": ) args = parser.parse_args() - log.info("[yellow]Running gts_run_all_repos script") + log.info("[bold yellow]Running gts_run_all_repos script") df = read_csv(args.csv_file) @@ -63,4 +63,4 @@ if __name__ == "__main__": for f in p_cwd.glob(f"????-??-??-???-???-*{datatype}-stats.csv"): db.migrate_csv(f, model, engine, skip_rows=1) - log.info("[dark_green]gts_run_all_repos script completed!") + log.info("[bold dark_green]gts_run_all_repos script completed!") diff --git a/scripts/make_stats_plots b/scripts/make_stats_plots index 81f8b4d..0649fad 100755 --- a/scripts/make_stats_plots +++ b/scripts/make_stats_plots @@ -46,7 +46,7 @@ if __name__ == "__main__": args = parser.parse_args() vargs = vars(args) - log.info("[yellow]Running make_stats_plots script") + log.info("[bold yellow]Running make_stats_plots script") if args.include_repos and args.exclude_repos: msg = "Cannot provide include_repos and exclude_repos simultaneously!" @@ -54,3 +54,5 @@ if __name__ == "__main__": raise ValueError(msg) stats_plots.make_plots(**vargs) + + log.info("[bold dark_green]make_stats_plots script completed!") diff --git a/scripts/migrate_to_sqlite b/scripts/migrate_to_sqlite index 0403a30..c2dc283 100755 --- a/scripts/migrate_to_sqlite +++ b/scripts/migrate_to_sqlite @@ -45,3 +45,5 @@ if __name__ == "__main__": db.migrate_csv(file, model=model, engine=engine) else: log.info("No merged files to migrate!") + + log.info("[bold dark_green]migrate_to_sqlite script completed!") From 29f8caa4d5348cb1bfa7315c64a92d80b77f93cf Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Wed, 20 Jul 2022 20:50:20 -0700 Subject: [PATCH 11/22] Update gts_run_all_repos to include paths and referrer files - Add db.query_referring function - Adjust db.migrate_csv to handle Referring data --- github_stats_pages/db.py | 33 +++++++++++++++++++++++++- github_stats_pages/models/referring.py | 2 +- scripts/gts_run_all_repos | 4 +++- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index a4e2692..48220ca 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -47,6 +47,10 @@ def migrate_csv( log.info(f"[yellow]Loading: {filename}") df = pd.read_csv(filename, header=None, skiprows=skip_rows, names=names) log.info(f"Size of dataframe: {len(df)}") + if model.__name__ == "Referring": # Add date since this isn't included + file_date = filename.name[:10] + df.insert(loc=0, column="date", value=file_date) + if model.__name__ == "Paths": repository_names = [a.split("/")[2] for a in df["path"].values] df.insert(1, "repository_name", repository_names) @@ -56,7 +60,12 @@ def migrate_csv( query_results = list( map(func, df["repository_name"], df["date"], df["path"]) ) - else: + elif model.__name__ == "Referring": + func = partial(query_referring, engine=engine, model=model) + query_results = list( + map(func, df["repository_name"], df["date"], df["site"]) + ) + else: # For Clone and Traffic func = partial(query, engine=engine, model=model) query_results = list(map(func, df["repository_name"], df["date"])) @@ -125,3 +134,25 @@ def query_path( return result.one() except NoResultFound: return + + +def query_referring( + repository_name: str, + date: str, + site: str, + engine: Engine, + model: Union[Type[SQLModel], Referring], +) -> Union[SQLModel, Referring, None]: + + with Session(engine) as session: + result = session.exec( + select(model).where( + model.repository_name == repository_name, + model.date == date, + model.site == site, + ) + ) + try: + return result.one() + except NoResultFound: + return diff --git a/github_stats_pages/models/referring.py b/github_stats_pages/models/referring.py index 750a280..b089774 100644 --- a/github_stats_pages/models/referring.py +++ b/github_stats_pages/models/referring.py @@ -7,6 +7,6 @@ class Referring(SQLModel, table=True): id: Optional[int] = Field(default=None, primary_key=True) repository_name: str site: str - date: str + date: Optional[str] total: int unique: int diff --git a/scripts/gts_run_all_repos b/scripts/gts_run_all_repos index be5e5fb..943e04b 100755 --- a/scripts/gts_run_all_repos +++ b/scripts/gts_run_all_repos @@ -59,7 +59,9 @@ if __name__ == "__main__": p_cwd = Path.cwd() p_data = p_cwd / "data" - for datatype, model in zip(["clone", "traffic"], [db.Clone, db.Traffic]): + model_names = ["clone", "traffic", "paths", "referrer"] + models = [db.Clone, db.Traffic, db.Paths, db.Referring] + for datatype, model in zip(model_names, models): for f in p_cwd.glob(f"????-??-??-???-???-*{datatype}-stats.csv"): db.migrate_csv(f, model, engine, skip_rows=1) From fa537adaff5f5f12950d29846deefd2e42da7a96 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 24 Jul 2022 12:27:31 -0700 Subject: [PATCH 12/22] migrate_csv: Sort by repository name and date --- github_stats_pages/db.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 48220ca..a8c30c4 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -55,6 +55,8 @@ def migrate_csv( repository_names = [a.split("/")[2] for a in df["path"].values] df.insert(1, "repository_name", repository_names) + df.sort_values(["repository_name", "date"], inplace=True) + if model.__name__ == "Paths": func = partial(query_path, engine=engine, model=model) query_results = list( From e8613dc5ecf70b894fc53c00e92d6e149001a4b2 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 31 Jul 2022 15:21:33 -0700 Subject: [PATCH 13/22] Fix a few things for consistency - Model fields consistency for clone, referrer - Change Referring to Referrer - db.migrate_csv: Simplify and handle merged referrer file - Add column names in merged_paths.csv - Fix sqlite3 test db --- github_stats_pages/db.py | 43 ++++++++---------- github_stats_pages/models/__init__.py | 2 +- github_stats_pages/models/clone.py | 2 +- .../models/{referring.py => referrer.py} | 4 +- scripts/gts_run_all_repos | 4 +- scripts/migrate_to_sqlite | 4 +- tests/test_db.py | 8 +++- tests_data/data/merged_paths.csv | 1 + tests_data/sqlite3.db | Bin 24576 -> 32768 bytes 9 files changed, 35 insertions(+), 33 deletions(-) rename github_stats_pages/models/{referring.py => referrer.py} (79%) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index a8c30c4..29ec60e 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -7,8 +7,9 @@ from sqlalchemy.exc import NoResultFound from sqlmodel import SQLModel, Session, create_engine, select -from .models import Clone, Referring, Traffic, Paths +from .models import Clone, Referrer, Traffic, Paths from .logger import app_log as log +from . import STATS_SORT_DATAFRAME SQLITE_FILE_NAME = Path("data/sqlite3.db") @@ -31,39 +32,35 @@ def create_db_and_tables(test: bool = False, echo: bool = False): def migrate_csv( - filename: Union[str, Path], + filename: Path, model: Type[SQLModel], engine: Engine, - skip_rows: Union[int, None] = None, ): """Migrate CSV over to SQLite""" - names = list( - map( - lambda f: f.name, - filter(lambda x: x.required, model.__fields__.values()), - ) - ) log.info(f"[yellow]Loading: {filename}") - df = pd.read_csv(filename, header=None, skiprows=skip_rows, names=names) + df = pd.read_csv(filename) log.info(f"Size of dataframe: {len(df)}") - if model.__name__ == "Referring": # Add date since this isn't included - file_date = filename.name[:10] - df.insert(loc=0, column="date", value=file_date) + if "merge" not in filename.name: + if model.__name__ == "Referrer": # Add date since this isn't included + file_date = filename.name[:10] + df.insert(loc=0, column="date", value=file_date) if model.__name__ == "Paths": repository_names = [a.split("/")[2] for a in df["path"].values] df.insert(1, "repository_name", repository_names) + simple_paths = ["/".join(a.split("/")[3:]) for a in df["path"].values] + df["path"] = simple_paths - df.sort_values(["repository_name", "date"], inplace=True) + df.sort_values(STATS_SORT_DATAFRAME[model.__name__.lower()], inplace=True) if model.__name__ == "Paths": func = partial(query_path, engine=engine, model=model) query_results = list( map(func, df["repository_name"], df["date"], df["path"]) ) - elif model.__name__ == "Referring": - func = partial(query_referring, engine=engine, model=model) + elif model.__name__ == "Referrer": + func = partial(query_referrer, engine=engine, model=model) query_results = list( map(func, df["repository_name"], df["date"], df["site"]) ) @@ -90,8 +87,8 @@ def query( repository_name: str, date: str, engine: Engine, - model: Union[Type[SQLModel], Clone, Referring, Paths, Traffic], -) -> Union[SQLModel, Clone, Referring, Paths, Traffic, None]: + model: Union[Type[SQLModel], Clone, Referrer, Paths, Traffic], +) -> Union[SQLModel, Clone, Referrer, Paths, Traffic, None]: with Session(engine) as session: result = session.exec( @@ -107,8 +104,8 @@ def query( def query_all( engine: Engine, - model: Union[Type[SQLModel], Clone, Referring, Paths, Traffic], -) -> List[Union[SQLModel, Clone, Referring, Paths, Traffic]]: + model: Union[Type[SQLModel], Clone, Referrer, Paths, Traffic], +) -> List[Union[SQLModel, Clone, Referrer, Paths, Traffic]]: """Retrieve an entire table""" with Session(engine) as session: @@ -138,13 +135,13 @@ def query_path( return -def query_referring( +def query_referrer( repository_name: str, date: str, site: str, engine: Engine, - model: Union[Type[SQLModel], Referring], -) -> Union[SQLModel, Referring, None]: + model: Union[Type[SQLModel], Referrer], +) -> Union[SQLModel, Referrer, None]: with Session(engine) as session: result = session.exec( diff --git a/github_stats_pages/models/__init__.py b/github_stats_pages/models/__init__.py index 054109e..2090a47 100644 --- a/github_stats_pages/models/__init__.py +++ b/github_stats_pages/models/__init__.py @@ -1,4 +1,4 @@ from .clone import Clone # noqa: F401 -from .referring import Referring # noqa: F401 +from .referrer import Referrer # noqa: F401 from .paths import Paths # noqa: F401 from .traffic import Traffic # noqa: F401 diff --git a/github_stats_pages/models/clone.py b/github_stats_pages/models/clone.py index be1a9e5..1585695 100644 --- a/github_stats_pages/models/clone.py +++ b/github_stats_pages/models/clone.py @@ -7,5 +7,5 @@ class Clone(SQLModel, table=True): id: Optional[int] = Field(default=None, primary_key=True) repository_name: str date: str - total: int + clones: int unique: int diff --git a/github_stats_pages/models/referring.py b/github_stats_pages/models/referrer.py similarity index 79% rename from github_stats_pages/models/referring.py rename to github_stats_pages/models/referrer.py index b089774..96c2107 100644 --- a/github_stats_pages/models/referring.py +++ b/github_stats_pages/models/referrer.py @@ -3,10 +3,10 @@ from sqlmodel import SQLModel, Field -class Referring(SQLModel, table=True): +class Referrer(SQLModel, table=True): id: Optional[int] = Field(default=None, primary_key=True) repository_name: str site: str date: Optional[str] - total: int + views: int unique: int diff --git a/scripts/gts_run_all_repos b/scripts/gts_run_all_repos index 943e04b..c8ec4df 100755 --- a/scripts/gts_run_all_repos +++ b/scripts/gts_run_all_repos @@ -60,9 +60,9 @@ if __name__ == "__main__": p_data = p_cwd / "data" model_names = ["clone", "traffic", "paths", "referrer"] - models = [db.Clone, db.Traffic, db.Paths, db.Referring] + models = [db.Clone, db.Traffic, db.Paths, db.Referrer] for datatype, model in zip(model_names, models): for f in p_cwd.glob(f"????-??-??-???-???-*{datatype}-stats.csv"): - db.migrate_csv(f, model, engine, skip_rows=1) + db.migrate_csv(f, model, engine) log.info("[bold dark_green]gts_run_all_repos script completed!") diff --git a/scripts/migrate_to_sqlite b/scripts/migrate_to_sqlite index c2dc283..efde911 100755 --- a/scripts/migrate_to_sqlite +++ b/scripts/migrate_to_sqlite @@ -5,7 +5,7 @@ import pandas as pd from github_stats_pages import db from github_stats_pages.logger import app_log as log -from github_stats_pages.models import Clone, Traffic, Referring, Paths +from github_stats_pages.models import Clone, Traffic, Referrer, Paths DROP_DUPLICATES_SUBSET = ["date", "repository_name", "site"] @@ -40,7 +40,7 @@ if __name__ == "__main__": merged_files = [x for x in sorted(p_data.glob("merged_*.csv"))] if merged_files: - model_list = [Clone, Paths, Referring, Traffic] + model_list = [Clone, Paths, Referrer, Traffic] for file, model in zip(merged_files, model_list): db.migrate_csv(file, model=model, engine=engine) else: diff --git a/tests/test_db.py b/tests/test_db.py index 6e8c621..ab54525 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -1,12 +1,16 @@ +from pathlib import Path + from github_stats_pages import db +t_data = Path("tests_data/data/") + def test_migrate_csv(test_engine): # This CSV is already present so no new records exist - db.migrate_csv("tests_data/data/merged_clone.csv", db.Clone, test_engine) + db.migrate_csv(t_data / "merged_clone.csv", db.Clone, test_engine) # This will add new records and ensure Paths testing - db.migrate_csv("tests_data/data/merged_paths.csv", db.Paths, test_engine) + db.migrate_csv(t_data / "merged_paths.csv", db.Paths, test_engine) def test_query(test_engine): diff --git a/tests_data/data/merged_paths.csv b/tests_data/data/merged_paths.csv index 23716a0..74ff2cf 100644 --- a/tests_data/data/merged_paths.csv +++ b/tests_data/data/merged_paths.csv @@ -1,3 +1,4 @@ +date,path,title,views,unique 2021-05-13,/astrochun/Evolution-of-Galaxies,astrochun/Evolution-of-Galaxies: Research with Dr. Chun Ly. I am working with...,23,3 2021-05-13,/astrochun/Evolution-of-Galaxies/blob/master/Analysis/emission_line_fit.py,Evolution-of-Galaxies/emission_line_fit.py at master · astrochun/Evolution-of...,4,2 2021-05-13,/astrochun/Evolution-of-Galaxies/issues,Issues · astrochun/Evolution-of-Galaxies,10,3 diff --git a/tests_data/sqlite3.db b/tests_data/sqlite3.db index 675edc53a3b72b230f0529e22be858eef6f1240f..b4540e689f36f28b1e73b3adea7badc952a22ba4 100644 GIT binary patch literal 32768 zcmeHPZ)_W98Mn`N5<9WIw(GJkE4|if(?2`g=f6n`tVx9%g#%=vtdkJvtQ zcTVfF4c&&|0}_ZKKJgL6CLtt*5MvS_015FK&^{pG6W{p+3^Zx*Ja^~#&b~XlrPCzz zo*LV|_j#W`zvp?M_ut)n^Vv;N73iuYH#n8f`X2H5{k|(S?eqB}@HYW}-75$`@UMIM zyFWa>BfjK~SEE1>oce(;emV9^v;{xNMGzne5CjMU1Ob8oL4Y7Y5FiK;1Y8l&)`H{H zGc*1-1eGh-1y$y%RgpKpLMuC~OS`M|?$Yw+DsA4R&qu;yh2FTnySlcz^AsHk%R)<1 zL{*YIr6$)9=;xMpR@Rqx=<7Fj>Fc*PH<6~o!Cc6x;6N1aC|z#0DreeF@r!NYjAP;U z&c@c#&TabI>TUYGSh*0npzR07r>Cd=Z{~GNTAW%_^pBv;3jNlg4X*mR7>70?D@U7D zQLTU6*1kxMPoF&Le{)>7Ru-y)EaN}pJ7BZ9dw0-m7wc?(0M|MjW5o>dczCW{_Uo>E zgV&AY>rzwDKPa17`mG1E$*Q`|Se@rXiAS89PsVs({Lk@s2P>YRBP=rqo|y!FfDFR zP%`NZ%cQeRx-c6-8o%FzCLapShKEw+CIYh)L#eWpf!Xn)RBRwHJLXN5VKVcNQnR6< zR7E;C8yrei2vW0wp;YW+p#-Hr2}XDNaPNDhGEk<|EJ>L_r-q)ALJqk5CjMU z1Ob8oL4Y7Y5FiK;1PB5I0fGQQ;1fgO5;g8)PdZNITb-7el;pkcN=hcn_<{JeBeAi# zlH{dEk30lcQSpDGZ4>bijUS(wp+qo2fFM8+AP5iy2m%BFf&f8)AV3fx2oMAa0uL8~ z@nzsQM&tf}B=${T{7>-w|6B1!d_A6seH8m+?5A*(Tm%7v06~BtKoB4Z5CjMU1Ob8o zL4Y7Y5cqfyNEq({xIWj8?+OGG#%lmxMER!@##;ccMEK-Bl`vic@FFUnOBn9}coF3T z3F8$2FCum_VY~t0MU)OFj28erh}ea|8U6Wx&qw>S3H|YZzZauS5a<82@0u_EmDpcm zZ$>|gcBcL|Rhycae0y>x@^)lC{PS>V;(HU1jemXIH`X4DhN{7z1=j=b2QE>6r7Hfv z`(K6U4c-F=K3|=l@SQ%J|FS=9JYJAz*zAiuS1)slP}<|_sw@bp2B)ZkoZ^~Xy`zYV zwfyiV4JQ#Abo8&^rJ;f>@wIj{WhD-7o(pv3$UD<{YS{C4Z4nNH zH=4WzXN2%7-Btuzs?tq7ow6?8<5W><;#rk+72rT*H?St1fSQgtCzHt#^~@6(DJQnP zrR$tLVkAmwb0;FeNDFDF-J+yim&*Mio~j5!D^rp)*^-par0z*lqr^2U&?4H3Qi7-v zWUiiUb*$0tPBlu~2z>Ez1U#xSZYDdosYQCbqt*c7Z1OJ6OBF%UvEG71cj|SK7u62E z0&XbFh99nSaPA4TZ2+LelSQ_(T*`d@-vPdhu zEVjV4szl4}W>akL(P~YgH@25nXpNgyqS;KWDX+5Bk2OEJsUW429qZ#ZQ|$ys8Z~v#u0QW zj-ZPXBojv-K$W}&l>(x2*tSuwy*Fan{7_{KRU(g|Dt#&28AfPd-Tckw^!&Y2cOSEDL zGCLbg`_La+Ivt1>?D+4%`@(dE19223PlC9)K5=W;H+F7rO3l5Sby0PYX)CvNGlp^4 z$q1D;FVM(D0VW_$<>;!c%hbXV5L(P@32Wr79!T{zjzbu!%B*p9h-DaDS~{2|b7<>G znCh0gdE2p8C{PMR8z`RTM$@Ds90+x()$_U|S5(2I3{tO5pn&PdPyxfzzN4sP0WC0& zV9wZ6z9X@o?(@ZG1Ws+sLMoqyNN83hIDfCLAKd4isxB&G+07LIYX+#yAa=RnvNI;vV(7e(J z?zZOiZW~Pr4X_%TdZ{ip1?CH}(-k`Y#eMge_(OH;VLH2Y2=l^GXzw*V^*gsrDM`d4hGFvX{gH3ODmi%dFQV{(NC(>;FDpilD3 z!2pxdsu_NVP7}s9J%`~4JD{CgLSG;Lo&JuxJ` zK($!ckM|d;wr+`2w)T&DzkV-8-4XJ>Qu3A$89_-1~+Itei z@mvL(B;OM|SVmFLeHM9~eA&vw8VSbHBeGBzVD75ynBR714bneBZGQ&&PCxI>7jm^C zY~#l;j$_pGpVm3I64g?to>Nxb; z?yMOzc3f*|NU}ilT#GA^+U%5N>8{8L)g+gIH#ErWtv62QU9Fz#)tLQ_TW={*gd$Y+MRGD7XlqbpC` z9z~#22R7)O$Eg>K$n(NW-r=z)bFG?^GLJppHBLqzrf}$H0eR01;cYZnWl3+heQRTb z8XvinN7(!Nu|c|C^BpY19!!-*7{AhZPLK#Ggf~{9I1EgAPSL|aKgSs(5oZTysW0U~ z@Yxmcw}#HucKM|AgOhE-^gAQUy1}BXxo6#i+Gd%vSFMtQ)C*}H zs@vAi&#GNhP|+{2V%id#6`{$4i?3RKyW_H%Le4WU4dHy<5J0bCqjBe^J(9$L+SiV_ z@$2=_0|AxlqR*K9TBlLB8tGI%Jj^tTL6xp)$30c*$~HWYr=0_Y$(q`yf*u9dQmdal zaG{<;C7#vd_n|~;^)4hQUS+Qi>R~S$O^LNj=<29#+4{)`HPDZS6G3sma81~@+<-|b00w5m#xbI3T< z(88xdKBUnZt2NgBKQr6dw#x1Mh*s*uohv(up%3z~H$Q7tegA(da?BUsjr}nCpXlz? zFQ+a|zBl>R$>qogk+-0LTm%7v06~BtKoB4Z5CjMU|L+LAb{$5iOR$V*9UUJ?cX^l= zs~pqlFb5K@?8A-;Jf~M#8+^9QofxpLX+wB5GmWUO-Z3XT7}FC+9QH#Sko+|~#}dQ<6Mb*qTwYshNmnlmeN(&sqW>G5Mf zhdiQr>wKVB>ZAeH+5j)5E@U=IYf2#$W3HpR{5oe%qAUr%c z+_&UDpv)DMJ*6F4GnE>^-)g9-RHn!k=Gk(V7jpSRkxf^Nm2$37NEh;jLN%RE3v4b| zDO7XLO0o8N)Syx(d+q!)sFU&_yL|9idV$GgY7EOZ7;{m7_&%R^EjI^4W796IqN=ca z>Iq+OB8*{VZiC_(^RQe{WAgd?**zUn%|%(%_1sm|^?%zu&1V`+-$v+wAN#j?9qPDf eX>6Wq7gkLFdo~nDP;Fb%Bm3XGmYcGGDfM64nvYHZ delta 572 zcmZo@U}`wPI6+#Foq>UY4TxEQm=TECC+ZlBvoq*^+`-G6$iTwAkb&QZFPLXK_d;&R zjg6YIXD*~P=d8CzIO5|eULiwkoyOHz}H^2>`;i{gt?^AdAY ziC4tt9OUX4;@VgYG+$htvDp=@wIH!1qZmv;zu0c9LZc1WFY654NW013lV~~QMe~5x#sE-dwrYN-_zc{lbzo;@EVvurSuP;R)7LR5Qo-9bQUm0AO)kyJ z%Xm(40VCiO*z>lNt9ZBaKum}S+cDXbSHnDwiCtV?p0QaP95zL%X{kj;nR)4O1~ViK zp+N)}o_vGHZ1PDy)yY!4ysRbpC5btcb$DGl9T~wP-IO@lmA?=ozIhM7hX50Uz-B>% X7yO$Q3~usIoM5@hfdwu$ae@#4QkSkZ From be1ffeb70b6881f3d279631bc78f8c275e07c4b9 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 31 Jul 2022 15:26:47 -0700 Subject: [PATCH 14/22] Minor fix for poort gts column names --- github_stats_pages/db.py | 1 + 1 file changed, 1 insertion(+) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 29ec60e..5def576 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -40,6 +40,7 @@ def migrate_csv( log.info(f"[yellow]Loading: {filename}") df = pd.read_csv(filename) + df.rename(columns={"unique_visitors/cloners": "unique"}, inplace=True) log.info(f"Size of dataframe: {len(df)}") if "merge" not in filename.name: if model.__name__ == "Referrer": # Add date since this isn't included From 674cf5735cfff7394f0afa04ed345fcd34e6dfb4 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 31 Jul 2022 15:31:36 -0700 Subject: [PATCH 15/22] Fix columns for top paths output --- github_stats_pages/__init__.py | 5 +++++ github_stats_pages/db.py | 4 ++-- scripts/merge_csv | 15 +++++++-------- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/github_stats_pages/__init__.py b/github_stats_pages/__init__.py index 0479570..72d1f7a 100644 --- a/github_stats_pages/__init__.py +++ b/github_stats_pages/__init__.py @@ -1,5 +1,10 @@ __version__ = "0.4.14" +RENAME_MAPPING = { + "count": "views", # for paths + "unique_visitors/cloners": "unique", # for clones, traffic, referrer + "uniques": "unique", # for paths +} STATS_TYPES = ["clone", "paths", "referrer", "traffic"] diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 5def576..a2d4975 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -9,7 +9,7 @@ from .models import Clone, Referrer, Traffic, Paths from .logger import app_log as log -from . import STATS_SORT_DATAFRAME +from . import RENAME_MAPPING, STATS_SORT_DATAFRAME SQLITE_FILE_NAME = Path("data/sqlite3.db") @@ -40,7 +40,7 @@ def migrate_csv( log.info(f"[yellow]Loading: {filename}") df = pd.read_csv(filename) - df.rename(columns={"unique_visitors/cloners": "unique"}, inplace=True) + df.rename(columns=RENAME_MAPPING, inplace=True) log.info(f"Size of dataframe: {len(df)}") if "merge" not in filename.name: if model.__name__ == "Referrer": # Add date since this isn't included diff --git a/scripts/merge_csv b/scripts/merge_csv index 27d4c28..3763e28 100755 --- a/scripts/merge_csv +++ b/scripts/merge_csv @@ -4,15 +4,14 @@ from pathlib import Path import pandas as pd -from github_stats_pages import STATS_TYPES, STATS_COLUMNS, STATS_SORT_DATAFRAME +from github_stats_pages import ( + RENAME_MAPPING, + STATS_TYPES, + STATS_COLUMNS, + STATS_SORT_DATAFRAME, +) from github_stats_pages.logger import app_log as log -rename_mapping = { - "count": "views", # for paths - "unique_visitors/cloners": "unique", # for clones, traffic, referrer - "uniques": "unique", # for paths -} - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -32,7 +31,7 @@ if __name__ == "__main__": log.info(f"Number of {stat} files found: {len(files)}") for file in files: df = pd.read_csv(file) - df.rename(columns=rename_mapping, inplace=True) + df.rename(columns=RENAME_MAPPING, inplace=True) if stat == "traffic": df.rename(columns={"total": "views"}, inplace=True) From d927e37f32dc14655ea4581448787f8fee7ae100 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 31 Jul 2022 19:20:31 -0700 Subject: [PATCH 16/22] Ensure that individual runs are added --- scripts/gts_run_all_repos | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/gts_run_all_repos b/scripts/gts_run_all_repos index c8ec4df..0798dcf 100755 --- a/scripts/gts_run_all_repos +++ b/scripts/gts_run_all_repos @@ -64,5 +64,6 @@ if __name__ == "__main__": for datatype, model in zip(model_names, models): for f in p_cwd.glob(f"????-??-??-???-???-*{datatype}-stats.csv"): db.migrate_csv(f, model, engine) + f.rename(p_data / f.name) log.info("[bold dark_green]gts_run_all_repos script completed!") From 3a858631c712360b3bc3989f3c9f11b83d853bef Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 1 Aug 2022 20:17:51 -0700 Subject: [PATCH 17/22] migrate_csv: Adjust sorting of dataframe --- github_stats_pages/db.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index a2d4975..c47f6e0 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -53,7 +53,9 @@ def migrate_csv( simple_paths = ["/".join(a.split("/")[3:]) for a in df["path"].values] df["path"] = simple_paths - df.sort_values(STATS_SORT_DATAFRAME[model.__name__.lower()], inplace=True) + sort_columns = STATS_SORT_DATAFRAME[model.__name__.lower()] + log.info(f"sort_columns: {sort_columns}") + df.sort_values(by=sort_columns, inplace=True) if model.__name__ == "Paths": func = partial(query_path, engine=engine, model=model) From ec1abc6ed5612be2b0f7d0cfcd56dcc52608afe2 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 1 Aug 2022 20:23:20 -0700 Subject: [PATCH 18/22] Minor fix [ci skip] --- github_stats_pages/db.py | 1 + 1 file changed, 1 insertion(+) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index c47f6e0..a14852f 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -42,6 +42,7 @@ def migrate_csv( df = pd.read_csv(filename) df.rename(columns=RENAME_MAPPING, inplace=True) log.info(f"Size of dataframe: {len(df)}") + log.info(f"columns: {df.columns}") if "merge" not in filename.name: if model.__name__ == "Referrer": # Add date since this isn't included file_date = filename.name[:10] From a9fc8e733012616dfe5dae38dcd4b55520dd42f4 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Mon, 1 Aug 2022 20:32:36 -0700 Subject: [PATCH 19/22] Debug messaging [ci skip] --- scripts/migrate_to_sqlite | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/migrate_to_sqlite b/scripts/migrate_to_sqlite index efde911..d692ced 100755 --- a/scripts/migrate_to_sqlite +++ b/scripts/migrate_to_sqlite @@ -1,4 +1,5 @@ #!/usr/bin/env python +import os from pathlib import Path import pandas as pd @@ -42,6 +43,7 @@ if __name__ == "__main__": if merged_files: model_list = [Clone, Paths, Referrer, Traffic] for file, model in zip(merged_files, model_list): + os.system(f"head -5 {file}") db.migrate_csv(file, model=model, engine=engine) else: log.info("No merged files to migrate!") From 5d0f280b1c9ef3ba2c3bfac271ea7db10d897f6f Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Tue, 2 Aug 2022 14:15:49 -0700 Subject: [PATCH 20/22] Handle existing new columns for merged paths CSV --- github_stats_pages/db.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index a14852f..77e32ed 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -49,10 +49,17 @@ def migrate_csv( df.insert(loc=0, column="date", value=file_date) if model.__name__ == "Paths": - repository_names = [a.split("/")[2] for a in df["path"].values] - df.insert(1, "repository_name", repository_names) - simple_paths = ["/".join(a.split("/")[3:]) for a in df["path"].values] - df["path"] = simple_paths + if "repository_name" not in df.columns: + repository_names = [a.split("/")[2] for a in df["path"].values] + df.insert(1, "repository_name", repository_names) + simple_paths = [ + "/".join(a.split("/")[3:]) for a in df["path"].values + ] + df["path"] = simple_paths + else: + log.info( + f"{filename} already updated with repository_name and path" + ) sort_columns = STATS_SORT_DATAFRAME[model.__name__.lower()] log.info(f"sort_columns: {sort_columns}") From 64265974416c2055712c96a5928fc3bef0e891b8 Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 7 Aug 2022 13:09:38 -0700 Subject: [PATCH 21/22] migrate_csv: Fix empty path handling (treat as string and not NaN) --- github_stats_pages/db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/github_stats_pages/db.py b/github_stats_pages/db.py index 77e32ed..ef079d1 100644 --- a/github_stats_pages/db.py +++ b/github_stats_pages/db.py @@ -39,7 +39,7 @@ def migrate_csv( """Migrate CSV over to SQLite""" log.info(f"[yellow]Loading: {filename}") - df = pd.read_csv(filename) + df = pd.read_csv(filename, na_filter=False) df.rename(columns=RENAME_MAPPING, inplace=True) log.info(f"Size of dataframe: {len(df)}") log.info(f"columns: {df.columns}") From ea6a21ff265369a3759f9c0a55bcb8cf2205b62d Mon Sep 17 00:00:00 2001 From: Chun Ly Date: Sun, 7 Aug 2022 13:32:03 -0700 Subject: [PATCH 22/22] Fix missing header for merged referrer file --- scripts/migrate_to_sqlite | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/migrate_to_sqlite b/scripts/migrate_to_sqlite index d692ced..777af0f 100755 --- a/scripts/migrate_to_sqlite +++ b/scripts/migrate_to_sqlite @@ -37,7 +37,7 @@ if __name__ == "__main__": log.info(f"Referrer record number: {len(referrer_merged_df)}") referrer_outfile = p_data / "merged_referrer.csv" log.info(f"Writing: {referrer_outfile}") - referrer_merged_df.to_csv(referrer_outfile, header=False, index=False) + referrer_merged_df.to_csv(referrer_outfile, index=False) merged_files = [x for x in sorted(p_data.glob("merged_*.csv"))] if merged_files: