diff --git a/sqlite/Readme.md b/sqlite/Readme.md index 19d868bbc..dd181bf4d 100755 --- a/sqlite/Readme.md +++ b/sqlite/Readme.md @@ -39,9 +39,17 @@ bash ./sqlite/graphsample_delete.sh sqlite/xxx.db 2>&1 | tee sqlite/logs/delete_ bash ./sqlite/graphsample_delete.sh 2>&1 | tee sqlite/logs/delete_$(date +"%Y%m%d_%H%M%S").log ``` -## Merge Databases and Upload to Hugging Face +## Merge Databases ```bash -# Usage: python ./sqlite/upload.py --main_db_path --new_db_path -python ./sqlite/upload.py --main_db_path --new_db_path +# Usage: python ./sqlite/merge_db.py --main_db_path --new_db_path +python ./sqlite/merge_db.py --main_db_path sqlite/GraphNet.db --new_db_path sqlite/new.db ``` + +## Upload to Hugging Face + +```bash +python ./sqlite/upload.py +``` + +**Note:** Set `HF_TOKEN` variable in `upload.py` before running. diff --git a/sqlite/graphsample_delete.py b/sqlite/graphsample_delete.py index ef02dfc74..3da9571e1 100755 --- a/sqlite/graphsample_delete.py +++ b/sqlite/graphsample_delete.py @@ -6,6 +6,8 @@ SubgraphSource, DimensionGeneralizationSource, DataTypeGeneralizationSource, + SampleOpNameList, + SampleOpName, ) @@ -63,6 +65,13 @@ def delete_graph_sample(db_path: str, relative_model_path: str, repo_uid: str = datatype_source.deleted = True datatype_source.delete_at = delete_at + session.query(SampleOpNameList).filter( + SampleOpNameList.sample_uuid == graph_sample.uuid + ).update({"deleted": True, "delete_at": delete_at}) + session.query(SampleOpName).filter( + SampleOpName.sample_uuid == graph_sample.uuid + ).update({"deleted": True, "delete_at": delete_at}) + session.commit() print(f"Successfully deleted: {relative_model_path}") return True diff --git a/sqlite/graphsample_insert.py b/sqlite/graphsample_insert.py index 962a4073c..404819601 100755 --- a/sqlite/graphsample_insert.py +++ b/sqlite/graphsample_insert.py @@ -11,7 +11,10 @@ SubgraphSource, DimensionGeneralizationSource, DataTypeGeneralizationSource, + SampleOpName, + SampleOpNameList, ) +from sqlalchemy import delete as sql_delete from sqlalchemy.exc import IntegrityError @@ -82,7 +85,7 @@ def insert_subgraph_source( if not full_graph: raise ValueError(f"Full graph not found for path: {parent_relative_path}") - range_info = _get_range_info(model_path_prefix, relative_model_path) + range_info = _get_parent_key_and_range(model_path_prefix, relative_model_path) subgraph_source = SubgraphSource( subgraph_uuid=subgraph_uuid, full_graph_uuid=full_graph.uuid, @@ -108,26 +111,6 @@ def insert_subgraph_source( session.close() -def _get_range_info(model_path_prefix: str, relative_model_path: str): - model_path = Path(model_path_prefix) / relative_model_path - subgraph_sources_file = model_path / "subgraph_sources.json" - if not subgraph_sources_file.exists(): - return {"start": -1, "end": -1} - - try: - with open(subgraph_sources_file) as f: - data = json.load(f) - for key, ranges in data.items(): - if isinstance(ranges, list): - r = ranges[0] - if isinstance(r, list) and len(r) == 2: - return {"start": r[0], "end": r[1]} - return {"start": -1, "end": -1} - except (json.JSONDecodeError, KeyError, TypeError, IndexError) as e: - print(f"Warning: Failed to parse {subgraph_sources_file}: {e}") - return {"start": -1, "end": -1} - - def get_parent_relative_path(relative_path: str) -> str: if "_decomposed" not in relative_path: return None @@ -274,6 +257,119 @@ def _get_data_type(model_path_prefix: str, relative_model_path: str): return "todo" +# SampleOpNameList and SampleOpName insert func +def _get_parent_key_and_range(model_path_prefix: 
str, relative_model_path: str) -> dict: + model_path = Path(model_path_prefix) / relative_model_path + subgraph_sources_file = model_path / "subgraph_sources.json" + if not subgraph_sources_file.exists(): + return {"parent_key": "", "start": -1, "end": -1} + + try: + with open(subgraph_sources_file) as f: + data = json.load(f) + for key, ranges in data.items(): + if isinstance(ranges, list) and len(ranges) > 0: + r = ranges[0] + if isinstance(r, list) and len(r) == 2: + return {"parent_key": key, "start": r[0], "end": r[1]} + return {"parent_key": "", "start": -1, "end": -1} + except (json.JSONDecodeError, KeyError, TypeError, IndexError) as e: + print(f"Warning: Failed to parse {subgraph_sources_file}: {e}") + return {"parent_key": "", "start": -1, "end": -1} + + +def insert_sample_op_name_list( + sample_uuid: str, + model_path_prefix: str, + op_names_path_prefix: str, + relative_model_path: str, + db_path: str, +): + if not op_names_path_prefix: + print("op_names_path_prefix not provided, skipping insert_sample_op_name_list") + return + + range_info = _get_parent_key_and_range(model_path_prefix, relative_model_path) + parent_key = range_info["parent_key"] + start = range_info["start"] + end = range_info["end"] + + if start == -1 or end == -1 or not parent_key: + print( + f"Invalid range info for {relative_model_path}, skipping insert_sample_op_name_list" + ) + return + + op_size = end - start + op_names_file = Path(op_names_path_prefix) / parent_key / "op_names.txt" + if not op_names_file.exists(): + print( + f"op_names.txt not found at {op_names_file}, skipping insert_sample_op_name_list" + ) + return + + try: + with open(op_names_file) as f: + all_op_names = [line.strip() for line in f.readlines() if line.strip()] + except Exception as e: + print(f"Warning: Failed to read {op_names_file}: {e}") + return + + op_start = start + op_end = end + if op_end > len(all_op_names): + print(f"Warning: op_end {op_end} exceeds total ops {len(all_op_names)}") + op_end = len(all_op_names) + if op_start >= op_end: + print(f"Warning: op_start {op_start} >= op_end {op_end}") + return + + selected_op_names = all_op_names[op_start:op_end] + op_names_json = json.dumps( + [{"op_name": name, "op_idx": i} for i, name in enumerate(selected_op_names)] + ) + session = get_session(db_path) + try: + session.execute( + sql_delete(SampleOpNameList).where( + SampleOpNameList.sample_uuid == sample_uuid + ) + ) + session.execute( + sql_delete(SampleOpName).where(SampleOpName.sample_uuid == sample_uuid) + ) + sample_op_name_list = SampleOpNameList( + sample_uuid=sample_uuid, + op_names_json=op_names_json, + create_at=datetime.now(), + deleted=False, + delete_at=None, + ) + session.add(sample_op_name_list) + + for idx, op_name in enumerate(selected_op_names): + sample_op_name = SampleOpName( + sample_uuid=sample_uuid, + op_name=op_name, + op_idx=idx, + op_size=op_size, + create_at=datetime.now(), + deleted=False, + delete_at=None, + ) + session.add(sample_op_name) + + session.commit() + print( + f"Inserted {len(selected_op_names)} op_names for sample_uuid={sample_uuid}" + ) + except IntegrityError as e: + session.rollback() + raise e + finally: + session.close() + + # main func def main(args): data = get_graph_sample_data( @@ -294,6 +390,13 @@ def main(args): relative_model_path=args.relative_model_path, db_path=args.db_path, ) + insert_sample_op_name_list( + sample_uuid=data["uuid"], + model_path_prefix=args.model_path_prefix, + op_names_path_prefix=args.op_names_path_prefix, + 
relative_model_path=args.relative_model_path, + db_path=args.db_path, + ) if args.sample_type in ["fusible_graph"]: insert_dimension_generalization_source( subgraph_source_data["subgraph_uuid"], @@ -358,5 +461,11 @@ def main(args): default="graphnet.db", help="Database file path e.g 'graphnet.db'", ) + parser.add_argument( + "--op_names_path_prefix", + type=str, + required=False, + help="Path prefix of op names file", + ) args = parser.parse_args() main(args) diff --git a/sqlite/graphsample_insert.sh b/sqlite/graphsample_insert.sh index d860a4d5f..c8c4f1940 100755 --- a/sqlite/graphsample_insert.sh +++ b/sqlite/graphsample_insert.sh @@ -3,14 +3,13 @@ set -x GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))") DB_PATH="${1:-${GRAPH_NET_ROOT}/sqlite/GraphNet.db}" -TORCH_MODEL_LIST="graph_net/config/small10_torch_samples_list.txt" +TORCH_MODEL_LIST="graph_net/config/torch_samples_list.txt" PADDLE_MODEL_LIST="graph_net/config/small10_paddle_samples_list.txt" -TYPICAL_GRAPH_SAMPLES_LIST="20260202_small10/range_decomposed_subgraph_sample_list.txt" -FUSIBLE_GRAPH_SAMPLES_LIST="20260202_small10/workspace_dimension_subgraph_samples/all_dimension_subgraph_list.txt" -SOLE_OP_GRAPH_SAMPLES_LIST="20260202_small10/sole/solo_sample_list.txt" +TYPICAL_GRAPH_SAMPLES_LIST="subgraph_dataset_20260203/deduplicated_subgraph_sample_list.txt" +FUSIBLE_GRAPH_SAMPLES_LIST="subgraph_dataset_20260203/deduplicated_dimension_generalized_subgraph_sample_list.txt" +SOLE_OP_GRAPH_SAMPLES_LIST="subgraph_dataset_20260203/sole/solo_sample_list.txt" ORDER_VALUE=0 - if [ ! -f "$DB_PATH" ]; then echo "Fail ! No Database ! : $DB_PATH" exit 1 @@ -21,7 +20,7 @@ while IFS= read -r model_rel_path; do python3 "${GRAPH_NET_ROOT}/sqlite/graphsample_insert.py" \ --model_path_prefix "$GRAPH_NET_ROOT" \ --relative_model_path "$model_rel_path" \ - --repo_uid "github_torch_samples" \ + --repo_uid "hf_torch_samples" \ --sample_type "full_graph" \ --order_value "$ORDER_VALUE" \ --db_path "$DB_PATH" @@ -35,7 +34,7 @@ while IFS= read -r model_rel_path; do python3 "${GRAPH_NET_ROOT}/sqlite/graphsample_insert.py" \ --model_path_prefix "$GRAPH_NET_ROOT" \ --relative_model_path "$model_rel_path" \ - --repo_uid "github_paddle_samples" \ + --repo_uid "hf_paddle_samples" \ --sample_type "full_graph" \ --order_value "$ORDER_VALUE" \ --db_path "$DB_PATH" @@ -47,9 +46,10 @@ done < "$PADDLE_MODEL_LIST" while IFS= read -r model_rel_path; do echo "insert : $model_rel_path" python3 "${GRAPH_NET_ROOT}/sqlite/graphsample_insert.py" \ - --model_path_prefix "${GRAPH_NET_ROOT}/20260202_small10/range_decompose" \ + --model_path_prefix "${GRAPH_NET_ROOT}/subgraph_dataset_20260203/typical_graph" \ --relative_model_path "$model_rel_path" \ - --repo_uid "github_torch_samples" \ + --op_names_path_prefix "${GRAPH_NET_ROOT}/subgraph_dataset_20260203/03_sample_op_names" \ + --repo_uid "hf_torch_samples" \ --sample_type "typical_graph" \ --order_value "$ORDER_VALUE" \ --db_path "$DB_PATH" @@ -61,9 +61,10 @@ done < "$TYPICAL_GRAPH_SAMPLES_LIST" while IFS= read -r model_rel_path; do echo "insert : $model_rel_path" python3 "${GRAPH_NET_ROOT}/sqlite/graphsample_insert.py" \ - --model_path_prefix "${GRAPH_NET_ROOT}/20260202_small10/workspace_dimension_subgraph_samples" \ + --model_path_prefix "${GRAPH_NET_ROOT}/subgraph_dataset_20260203/fusible_graph" \ --relative_model_path "$model_rel_path" \ - --repo_uid "github_torch_samples" \ + --op_names_path_prefix 
"${GRAPH_NET_ROOT}/subgraph_dataset_20260203/03_sample_op_names" \ + --repo_uid "hf_torch_samples" \ --sample_type "fusible_graph" \ --order_value "$ORDER_VALUE" \ --db_path "$DB_PATH" @@ -75,9 +76,10 @@ done < "$FUSIBLE_GRAPH_SAMPLES_LIST" while IFS= read -r model_rel_path; do echo "insert : $model_rel_path" python3 "${GRAPH_NET_ROOT}/sqlite/graphsample_insert.py" \ - --model_path_prefix "${GRAPH_NET_ROOT}/20260202_small10/sole" \ + --model_path_prefix "${GRAPH_NET_ROOT}/subgraph_dataset_20260203/sole_op_graph" \ --relative_model_path "$model_rel_path" \ - --repo_uid "github_torch_samples" \ + --op_names_path_prefix "${GRAPH_NET_ROOT}/subgraph_dataset_20260203/03_sample_op_names" \ + --repo_uid "hf_torch_samples" \ --sample_type "sole_op_graph" \ --order_value "$ORDER_VALUE" \ --db_path "$DB_PATH" diff --git a/sqlite/merge_db.py b/sqlite/merge_db.py new file mode 100644 index 000000000..7c0e6fd0b --- /dev/null +++ b/sqlite/merge_db.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +import argparse +from orm_models import ( + get_session, + Repo, + GraphSample, + SubgraphSource, + DimensionGeneralizationSource, + DataTypeGeneralizationSource, + BackwardGraphSource, + SampleOpName, + SampleOpNameList, +) +from sqlalchemy.exc import IntegrityError + + +def merge_databases(main_db_path: str, new_db_path: str): + main_session = get_session(main_db_path) + new_session = get_session(new_db_path) + + stats = { + "repo": 0, + "graph_sample": 0, + "subgraph_source": 0, + "dimension_generalization_source": 0, + "datatype_generalization_source": 0, + "backward_graph_source": 0, + "sample_op_name": 0, + "sample_op_name_list": 0, + } + + try: + existing_repo_uids = { + r.repo_uid for r in main_session.query(Repo.repo_uid).all() + } + new_repos = ( + new_session.query(Repo) + .filter(Repo.repo_uid.notin_(existing_repo_uids)) + .all() + ) + for repo in new_repos: + new_repo = Repo( + repo_uid=repo.repo_uid, + repo_type=repo.repo_type, + repo_name=repo.repo_name, + repo_url=repo.repo_url, + ) + main_session.add(new_repo) + stats["repo"] += 1 + main_session.commit() + + subgraph_map = { + s.subgraph_uuid: s for s in new_session.query(SubgraphSource).all() + } + dim_gen_map = { + s.generalized_graph_uuid: s + for s in new_session.query(DimensionGeneralizationSource).all() + } + dtype_gen_map = { + s.generalized_graph_uuid: s + for s in new_session.query(DataTypeGeneralizationSource).all() + } + backward_map = { + s.backward_graph_uuid: s + for s in new_session.query(BackwardGraphSource).all() + } + sample_op_name_map = { + (s.sample_uuid, s.op_idx): s for s in new_session.query(SampleOpName).all() + } + sample_op_name_list_map = { + s.sample_uuid: s for s in new_session.query(SampleOpNameList).all() + } + + existing_graph_uuids = { + g.uuid for g in main_session.query(GraphSample.uuid).all() + } + existing_paths = { + (g.relative_model_path, g.repo_uid) + for g in main_session.query( + GraphSample.relative_model_path, GraphSample.repo_uid + ).all() + } + new_graph_samples = ( + new_session.query(GraphSample) + .filter(GraphSample.uuid.notin_(existing_graph_uuids)) + .all() + ) + new_graph_samples.sort(key=lambda x: 0 if x.sample_type == "full_graph" else 1) + + for sample in new_graph_samples: + if (sample.relative_model_path, sample.repo_uid) in existing_paths: + continue + new_sample = GraphSample( + uuid=sample.uuid, + repo_uid=sample.repo_uid, + relative_model_path=sample.relative_model_path, + sample_type=sample.sample_type, + is_subgraph=sample.is_subgraph, + num_ops=sample.num_ops, + 
graph_hash=sample.graph_hash, + order_value=sample.order_value, + create_at=sample.create_at, + deleted=sample.deleted, + delete_at=sample.delete_at, + ) + main_session.add(new_sample) + stats["graph_sample"] += 1 + + if sample.uuid in subgraph_map: + src = subgraph_map[sample.uuid] + new_subgraph = SubgraphSource( + subgraph_uuid=src.subgraph_uuid, + full_graph_uuid=src.full_graph_uuid, + range_start=src.range_start, + range_end=src.range_end, + create_at=src.create_at, + deleted=src.deleted, + delete_at=src.delete_at, + ) + main_session.add(new_subgraph) + stats["subgraph_source"] += 1 + + if sample.uuid in dim_gen_map: + src = dim_gen_map[sample.uuid] + new_dim = DimensionGeneralizationSource( + generalized_graph_uuid=src.generalized_graph_uuid, + original_graph_uuid=src.original_graph_uuid, + total_element_size=src.total_element_size, + create_at=src.create_at, + deleted=src.deleted, + delete_at=src.delete_at, + ) + main_session.add(new_dim) + stats["dimension_generalization_source"] += 1 + + if sample.uuid in dtype_gen_map: + src = dtype_gen_map[sample.uuid] + new_dtype = DataTypeGeneralizationSource( + generalized_graph_uuid=src.generalized_graph_uuid, + original_graph_uuid=src.original_graph_uuid, + data_type=src.data_type, + create_at=src.create_at, + deleted=src.deleted, + delete_at=src.delete_at, + ) + main_session.add(new_dtype) + stats["datatype_generalization_source"] += 1 + + if sample.uuid in backward_map: + src = backward_map[sample.uuid] + new_back = BackwardGraphSource( + backward_graph_uuid=src.backward_graph_uuid, + forward_graph_uuid=src.forward_graph_uuid, + original_graph_uuid=src.original_graph_uuid, + create_at=src.create_at, + deleted=src.deleted, + delete_at=src.delete_at, + ) + main_session.add(new_back) + stats["backward_graph_source"] += 1 + + if sample.uuid in sample_op_name_list_map: + src = sample_op_name_list_map[sample.uuid] + new_op_name_list = SampleOpNameList( + sample_uuid=src.sample_uuid, + op_names_json=src.op_names_json, + create_at=src.create_at, + deleted=src.deleted, + delete_at=src.delete_at, + ) + main_session.add(new_op_name_list) + stats["sample_op_name_list"] += 1 + + for op_idx in range(sample.num_ops): + key = (sample.uuid, op_idx) + if key in sample_op_name_map: + src = sample_op_name_map[key] + new_op_name = SampleOpName( + sample_uuid=src.sample_uuid, + op_name=src.op_name, + op_idx=src.op_idx, + op_size=src.op_size, + create_at=src.create_at, + deleted=src.deleted, + delete_at=src.delete_at, + ) + main_session.add(new_op_name) + stats["sample_op_name"] += 1 + + main_session.commit() + + total_merged = sum(stats.values()) + print( + f"Total merged: {total_merged} records from {new_db_path} into {main_db_path}" + ) + print(f" Breakdown: {stats}") + + return stats + + except IntegrityError as e: + main_session.rollback() + print(f"IntegrityError during merge: {e}") + return stats + except Exception as e: + main_session.rollback() + print(f"Error during merge: {e}") + return stats + finally: + new_session.close() + main_session.close() + + +def main(): + parser = argparse.ArgumentParser( + description="Merge new database into main GraphNet database" + ) + parser.add_argument( + "--new_db_path", + type=str, + required=True, + help="Path to new database file (e.g., /path/to/new.db)", + ) + parser.add_argument( + "--main_db_path", + type=str, + required=True, + help="Path to main GraphNet.db (e.g., /path/to/GraphNet.db)", + ) + + args = parser.parse_args() + + stats = merge_databases(args.main_db_path, args.new_db_path) + print("Merge 
Summary:") + total_merged = sum(stats.values()) + print(f" Total records merged: {total_merged}") + for table, count in stats.items(): + print(f" - {table}: {count}") + + +if __name__ == "__main__": + main() diff --git a/sqlite/migrates/create_main_tables_2026-02-02-031353.sql b/sqlite/migrates/create_main_tables_2026-02-02-031353.sql old mode 100644 new mode 100755 index 67cd89393..23d135125 --- a/sqlite/migrates/create_main_tables_2026-02-02-031353.sql +++ b/sqlite/migrates/create_main_tables_2026-02-02-031353.sql @@ -7,8 +7,8 @@ CREATE TABLE IF NOT EXISTS repo ( repo_url TEXT ); INSERT OR IGNORE INTO repo (repo_uid, repo_type, repo_name, repo_url) VALUES -('github_torch_samples', 'github', 'GraphNet', 'https://github.com/PaddlePaddle/GraphNet'), -('github_paddle_samples', 'github', 'GraphNet', 'https://github.com/PaddlePaddle/GraphNet'); +('hf_torch_samples', 'huggingface', 'GraphNet', 'https://huggingface.co/datasets/PaddlePaddle/GraphNet'), +('hf_paddle_samples', 'huggingface', 'GraphNet', 'https://huggingface.co/datasets/PaddlePaddle/GraphNet'); -- create graph_sample table diff --git a/sqlite/migrates/create_main_tables_2026-02-11-034320.sql b/sqlite/migrates/create_main_tables_2026-02-11-034320.sql new file mode 100755 index 000000000..c084e05df --- /dev/null +++ b/sqlite/migrates/create_main_tables_2026-02-11-034320.sql @@ -0,0 +1,26 @@ +--create table sample_op_name +CREATE TABLE IF NOT EXISTS sample_op_name ( + sample_uuid VARCHAR(255) NOT NULL, + op_name VARCHAR(255) NOT NULL, + op_idx INTEGER NOT NULL, + op_size INTEGER NOT NULL, + create_at DATETIME DEFAULT CURRENT_TIMESTAMP, + delete_at DATETIME, + deleted BOOLEAN DEFAULT FALSE, + FOREIGN KEY (sample_uuid) REFERENCES graph_sample(uuid), + PRIMARY KEY (sample_uuid, op_idx) +); +CREATE INDEX IF NOT EXISTS idx_sample_op_name_sample_uuid ON sample_op_name (sample_uuid); +CREATE INDEX IF NOT EXISTS idx_sample_op_name_op_name ON sample_op_name (op_name); + + +--create table sample_op_name_list +CREATE TABLE IF NOT EXISTS sample_op_name_list ( + sample_uuid VARCHAR(255) NOT NULL PRIMARY KEY, + op_names_json TEXT NOT NULL, + create_at DATETIME DEFAULT CURRENT_TIMESTAMP, + delete_at DATETIME, + deleted BOOLEAN DEFAULT FALSE, + FOREIGN KEY (sample_uuid) REFERENCES graph_sample(uuid) +); +CREATE INDEX IF NOT EXISTS idx_sample_op_name_list_op_names ON sample_op_name_list (op_names_json); diff --git a/sqlite/orm_models.py b/sqlite/orm_models.py old mode 100644 new mode 100755 index a9fd5cb9a..c48278f95 --- a/sqlite/orm_models.py +++ b/sqlite/orm_models.py @@ -10,6 +10,7 @@ ForeignKey, Index, UniqueConstraint, + Text, ) Base = declarative_base() @@ -95,6 +96,17 @@ class GraphSample(Base): foreign_keys="BackwardGraphSource.original_graph_uuid", back_populates="original_graph", ) + sample_op_names = relationship( + "SampleOpName", + foreign_keys="SampleOpName.sample_uuid", + back_populates="sample", + ) + sample_op_name_list = relationship( + "SampleOpNameList", + foreign_keys="SampleOpNameList.sample_uuid", + back_populates="sample", + uselist=False, + ) class SubgraphSource(Base): @@ -229,6 +241,43 @@ class BackwardGraphSource(Base): ) +class SampleOpName(Base): + __tablename__ = "sample_op_name" + + sample_uuid = Column( + String(255), ForeignKey("graph_sample.uuid"), nullable=False, primary_key=True + ) + op_idx = Column(Integer, nullable=False, primary_key=True) + op_name = Column(String(255), nullable=False) + op_size = Column(Integer, nullable=False) + create_at = Column(DateTime, default=datetime.now) + delete_at = Column(DateTime) 
+ deleted = Column(Boolean, default=False) + + __table_args__ = ( + Index("idx_sample_op_name_sample_uuid", "sample_uuid"), + Index("idx_sample_op_name_op_name", "op_name"), + ) + + sample = relationship("GraphSample", back_populates="sample_op_names") + + +class SampleOpNameList(Base): + __tablename__ = "sample_op_name_list" + + sample_uuid = Column( + String(255), ForeignKey("graph_sample.uuid"), nullable=False, primary_key=True + ) + op_names_json = Column(Text, nullable=False) + create_at = Column(DateTime, default=datetime.now) + delete_at = Column(DateTime) + deleted = Column(Boolean, default=False) + + __table_args__ = (Index("idx_sample_op_name_list_op_names", "op_names_json"),) + + sample = relationship("GraphSample", back_populates="sample_op_name_list") + + def get_session(db_path: str, echo: bool = False): engine = create_engine(f"sqlite:///{db_path}", echo=echo) Session = sessionmaker(bind=engine) diff --git a/sqlite/upload.py b/sqlite/upload.py index 2556fe8ba..72308f28a 100755 --- a/sqlite/upload.py +++ b/sqlite/upload.py @@ -1,261 +1,64 @@ -#!/usr/bin/env python3 -import argparse -from orm_models import ( - get_session, - Repo, - GraphSample, - SubgraphSource, - DimensionGeneralizationSource, - DataTypeGeneralizationSource, - BackwardGraphSource, -) -from sqlalchemy.exc import IntegrityError import os -from huggingface_hub import HfApi - -BASE_PATH = "/work/clone/GraphNet/subgraph_dataset_workspace_small10_torch_samples" -FULL_GRAPH_PATH = f"{BASE_PATH}/full_graph" -TYPICAL_GRAPH_PATH = f"{BASE_PATH}/typical_graph" -FUSIBLE_GRAPH_PATH = f"{BASE_PATH}/fusible_graph" -SOLE_OP_GRAPH_PATH = f"{BASE_PATH}/sole_op_graph" - -HF_REPO_ID = "PaddlePaddle/GraphNet" -HF_REPO_TYPE = "dataset" -HF_BRANCH = "main" -HF_TOKEN = os.environ.get("HF_TOKEN") -if not HF_TOKEN: - raise ValueError("HF_TOKEN environment variable not set") -HF_LARGE_THRESHOLD_MB = 1 - - -def upload_to_huggingface(api: HfApi, local_path: str, repo_id: str, path_in_repo: str): - if not os.path.exists(local_path): - print(f"Warning: Local path not found, skipping upload: {local_path}") - return - print(f"Uploading folder {local_path} to HF: {repo_id}/{path_in_repo} ...") - try: - api.upload_folder( - folder_path=local_path, - path_in_repo=path_in_repo, - repo_id=repo_id, - repo_type=HF_REPO_TYPE, - revision=HF_BRANCH, - commit_message=f"Upload folder: {path_in_repo}", - ignore_patterns=["**/__pycache__/*", "*.pyc", ".ipynb_checkpoints/*"], - ) - print(f"Success: Folder uploaded to {path_in_repo}") - except Exception as e: - print(f"Error uploading to Hugging Face: {e}") - - -def merge_databases(main_db_path: str, new_db_path: str): - main_session = get_session(main_db_path) - new_session = get_session(new_db_path) - - stats = { - "repo": 0, - "graph_sample": 0, - "subgraph_source": 0, - "dimension_generalization_source": 0, - "datatype_generalization_source": 0, - "backward_graph_source": 0, - } - - try: - existing_repo_uids = { - r.repo_uid for r in main_session.query(Repo.repo_uid).all() - } - new_repos = ( - new_session.query(Repo) - .filter(Repo.repo_uid.notin_(existing_repo_uids)) - .all() - ) - for repo in new_repos: - new_repo = Repo( - repo_uid=repo.repo_uid, - repo_type=repo.repo_type, - repo_name=repo.repo_name, - repo_url=repo.repo_url, - ) - main_session.add(new_repo) - stats["repo"] += 1 - main_session.commit() - - subgraph_map = { - s.subgraph_uuid: s for s in new_session.query(SubgraphSource).all() - } - dim_gen_map = { - s.generalized_graph_uuid: s - for s in 
new_session.query(DimensionGeneralizationSource).all() - } - dtype_gen_map = { - s.generalized_graph_uuid: s - for s in new_session.query(DataTypeGeneralizationSource).all() - } - backward_map = { - s.backward_graph_uuid: s - for s in new_session.query(BackwardGraphSource).all() - } +from datasets import Dataset +from huggingface_hub import HfApi, login + + +HF_TOKEN = "" +REPO_ID = "PaddlePaddle/GraphNet" +REVISION = "20260203" +BASE_DIR = "/work/GraphNet/torch_paddle_samples/subgraph_dataset_20260203" +FOLDERS_TO_PACK = ["full_graph", "fusible_graph", "sole_op_graph", "typical_graph"] +DB_FILE = "GraphNet.db" + + +def is_clean_file(filename, root): + ext = os.path.splitext(filename)[1].lower() + if ext in {".pyc", ".pyo", ".pyd", ".so"}: + return False + if any(x in root for x in ["__pycache__", ".git", ".ipynb_checkpoints"]): + return False + return True + + +def file_generator(): + file_list = [ + (os.path.join(root, f), folder) + for folder in FOLDERS_TO_PACK + if os.path.exists(os.path.join(BASE_DIR, folder)) + for root, _, files in os.walk(os.path.join(BASE_DIR, folder)) + for f in files + if is_clean_file(f, root) + and os.path.splitext(f)[1].lower() in {".py", ".json", ".txt", ".yaml", ".md"} + ] - existing_graph_uuids = { - g.uuid for g in main_session.query(GraphSample.uuid).all() - } - existing_paths = { - (g.relative_model_path, g.repo_uid) - for g in main_session.query( - GraphSample.relative_model_path, GraphSample.repo_uid - ).all() + return ( + { + "path": os.path.relpath(fp, BASE_DIR), + "content": open(fp, "r", encoding="utf-8", errors="ignore").read(), + "source_folder": src, } - new_graph_samples = ( - new_session.query(GraphSample) - .filter(GraphSample.uuid.notin_(existing_graph_uuids)) - .all() - ) - new_graph_samples.sort(key=lambda x: 0 if x.sample_type == "full_graph" else 1) - - for sample in new_graph_samples: - if (sample.relative_model_path, sample.repo_uid) in existing_paths: - continue - new_sample = GraphSample( - uuid=sample.uuid, - repo_uid=sample.repo_uid, - relative_model_path=sample.relative_model_path, - sample_type=sample.sample_type, - is_subgraph=sample.is_subgraph, - num_ops=sample.num_ops, - graph_hash=sample.graph_hash, - order_value=sample.order_value, - create_at=sample.create_at, - deleted=sample.deleted, - delete_at=sample.delete_at, - ) - main_session.add(new_sample) - stats["graph_sample"] += 1 - - if sample.uuid in subgraph_map: - src = subgraph_map[sample.uuid] - new_subgraph = SubgraphSource( - subgraph_uuid=src.subgraph_uuid, - full_graph_uuid=src.full_graph_uuid, - range_start=src.range_start, - range_end=src.range_end, - create_at=src.create_at, - deleted=src.deleted, - delete_at=src.delete_at, - ) - main_session.add(new_subgraph) - stats["subgraph_source"] += 1 - - if sample.uuid in dim_gen_map: - src = dim_gen_map[sample.uuid] - new_dim = DimensionGeneralizationSource( - generalized_graph_uuid=src.generalized_graph_uuid, - original_graph_uuid=src.original_graph_uuid, - total_element_size=src.total_element_size, - create_at=src.create_at, - deleted=src.deleted, - delete_at=src.delete_at, - ) - main_session.add(new_dim) - stats["dimension_generalization_source"] += 1 - - if sample.uuid in dtype_gen_map: - src = dtype_gen_map[sample.uuid] - new_dtype = DataTypeGeneralizationSource( - generalized_graph_uuid=src.generalized_graph_uuid, - original_graph_uuid=src.original_graph_uuid, - data_type=src.data_type, - create_at=src.create_at, - deleted=src.deleted, - delete_at=src.delete_at, - ) - main_session.add(new_dtype) - 
stats["datatype_generalization_source"] += 1 - - if sample.uuid in backward_map: - src = backward_map[sample.uuid] - new_back = BackwardGraphSource( - backward_graph_uuid=src.backward_graph_uuid, - forward_graph_uuid=src.forward_graph_uuid, - create_at=src.create_at, - deleted=src.deleted, - delete_at=src.delete_at, - ) - main_session.add(new_back) - stats["backward_graph_source"] += 1 - - main_session.commit() - - total_merged = sum(stats.values()) - print( - f"Total merged: {total_merged} records from {new_db_path} into {main_db_path}" - ) - print(f" Breakdown: {stats}") - - return stats - - except IntegrityError as e: - main_session.rollback() - print(f"IntegrityError during merge: {e}") - return stats - except Exception as e: - main_session.rollback() - print(f"Error during merge: {e}") - return stats - finally: - new_session.close() - main_session.close() - - -def main(): - parser = argparse.ArgumentParser( - description="Merge new database into main GraphNet database" - ) - parser.add_argument( - "--new_db_path", - type=str, - required=True, - help="Path to new database file (e.g., /path/to/new.db)", + for fp, src in file_list ) - parser.add_argument( - "--main_db_path", - type=str, - required=True, - help="Path to main GraphNet.db (e.g., /path/to/GraphNet.db)", - ) - - args = parser.parse_args() - print("SREP 1: Merging databases...") - stats = merge_databases(args.main_db_path, args.new_db_path) - print("Merge Summary:") - total_merged = sum(stats.values()) - print(f" Total records merged: {total_merged}") - for table, count in stats.items(): - print(f" - {table}: {count}") - print("STEP 2: Uploading samples to Hugging Face...") - api = HfApi(token=HF_TOKEN) - folders_to_upload = [ - (FULL_GRAPH_PATH, "full_graph"), - (TYPICAL_GRAPH_PATH, "typical_graph"), - (FUSIBLE_GRAPH_PATH, "fusible_graph"), - (SOLE_OP_GRAPH_PATH, "sole_op_graph"), - ] - for local_p, remote_p in folders_to_upload: - upload_to_huggingface(api, local_p, HF_REPO_ID, remote_p) - - print("STEP 3: Uploading database to Hugging Face...") - # Upload main database to HF - api.upload_file( - path_or_fileobj=args.main_db_path, - path_in_repo="GraphNet.db", - repo_id=HF_REPO_ID, - repo_type=HF_REPO_TYPE, - revision=HF_BRANCH, - commit_message="Update GraphNet.db database", - ) - print(f"Success: Uploaded database {args.main_db_path}") +def main(): + login(token=HF_TOKEN) + + ds = Dataset.from_generator(file_generator) + ds.push_to_hub(REPO_ID, split="GraphNet", max_shard_size="500MB", revision=REVISION) + print("Folder data uploaded successfully!") + + api = HfApi() + db_path = os.path.join(BASE_DIR, DB_FILE) + if os.path.exists(db_path): + api.upload_file( + path_or_fileobj=db_path, + path_in_repo=DB_FILE, + repo_id=REPO_ID, + repo_type="dataset", + revision=REVISION, + ) + print(f"{DB_FILE} uploaded successfully!") if __name__ == "__main__":
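Addendum (not part of the patch): a minimal read-back sketch for the new `sample_op_name` rows that `graphsample_insert.py` now populates, using only the ORM names this patch adds to `sqlite/orm_models.py` (`get_session`, `GraphSample`, `SampleOpName`). The database path, the `typical_graph` filter, and running from inside `sqlite/` are assumptions for illustration, not part of the change itself.

```python
# Sketch: list the recorded op names for one non-deleted typical_graph sample.
# Assumes the patched orm_models.py is importable (e.g. run from sqlite/)
# and that GraphNet.db has been populated by graphsample_insert.sh.
from orm_models import get_session, GraphSample, SampleOpName

session = get_session("GraphNet.db")
try:
    sample = (
        session.query(GraphSample)
        .filter(
            GraphSample.sample_type == "typical_graph",
            GraphSample.deleted.is_(False),
        )
        .first()
    )
    if sample is not None:
        op_rows = (
            session.query(SampleOpName)
            .filter(
                SampleOpName.sample_uuid == sample.uuid,
                SampleOpName.deleted.is_(False),
            )
            .order_by(SampleOpName.op_idx)
            .all()
        )
        print(sample.relative_model_path, [row.op_name for row in op_rows])
finally:
    session.close()
```

The per-op rows and the JSON blob in `sample_op_name_list` carry the same names; the table form is the one to query or index by `op_name`, while the list form gives the whole sequence in a single fetch.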