From da29ca7353a52b2f4fce62fbfbc8d95f9fca23c4 Mon Sep 17 00:00:00 2001 From: aritraroy24 Date: Thu, 2 Apr 2026 10:25:31 +0100 Subject: [PATCH 1/3] docs: update citations, readme and add API key guide for v0.1.6 --- .gitignore | 8 +- CHANGELOG.md | 14 ++ CITATION.cff | 19 +- README.md | 17 +- docs/about/changelog.md | 22 +- docs/about/citation.md | 17 +- docs/getting-started/api-key-guide.md | 322 ++++++++++++++++++++++++++ 7 files changed, 395 insertions(+), 24 deletions(-) create mode 100644 docs/getting-started/api-key-guide.md diff --git a/.gitignore b/.gitignore index a1b8848a..a3f4f2a5 100644 --- a/.gitignore +++ b/.gitignore @@ -180,4 +180,10 @@ cython_debug/ examples/db/10.* tests example/ -applications \ No newline at end of file +applications +vlm_test +examples/vlm_piezo_test + +# Claude files +CLAUDE.md +.claude \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index faeeea1e..34351bd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## [0.1.6] - 02-04-2026 +### Changed +- Updated [README.md](README.md), [CITATION.cff](CITATION.cff) and docs with the published version (advance article) of the ComProScanner paper in _Digital Discovery_ as fully open access: + - [ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature](https://doi.org/10.1039/D5DD00521C) + +### Added +- Guide for API key creation for various LLM providers and publisher APIs added to the documentation at `docs/getting-started/api-key-guide.md` with detailed instructions for each provider. + +--- ## [0.1.5] - 08-02-2026 ### Added @@ -69,6 +78,7 @@ - README badges section converted from HTML to markdown format for better compatibility across platforms. 
+--- ## [0.1.4] - 02-12-2025 ### Added @@ -104,6 +114,7 @@ - [ComProScanner Logo](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/comproscanner_logo.png) - [ComProScanner Workflow](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/overall_workflow.png) +--- ## [0.1.3] - 04-11-2025 ### Fixed @@ -112,6 +123,7 @@ - Changed from `from langchain.text_splitter import RecursiveCharacterTextSplitter` - To `from langchain.text_splitter.recursive_character import RecursiveCharacterTextSplitter` +--- ## [0.1.2] - 24-10-2025 ### Added @@ -119,6 +131,7 @@ - Link to ComProScanner preprint on arXiv in the documentation index page and README.md: - [arXiv:2510.20362](https://arxiv.org/abs/2510.20362) +--- ## [0.1.1] - 22-10-2025 ### Fixed @@ -127,6 +140,7 @@ - [ComProScanner Logo](https://i.ibb.co/whHSbGvT/comproscanner-logo.png) - [ComProScanner Workflow](https://i.ibb.co/QWd2qd3/overall-workflow.png) +--- ## [0.1.0] - 22-10-2025 ### Added diff --git a/CITATION.cff b/CITATION.cff index 723c2600..56b4a26c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -16,7 +16,7 @@ contact: - family-names: Roy given-names: Aritra orcid: "https://orcid.org/0000-0002-4928-2935" -message: If you use this software, please cite our article on arXiv. +message: If you use this software, please cite our article in Digital Discovery. 
preferred-citation: authors: - family-names: Roy @@ -31,21 +31,28 @@ preferred-citation: - family-names: Gattinoni given-names: Chiara orcid: "https://orcid.org/0000-0002-3376-6374" - date-published: 2025-10-23 + doi: "10.1039/D5DD00521C" identifiers: + - type: doi + value: "10.1039/D5DD00521C" + description: "Peer-reviewed article" - type: other value: "arXiv:2510.20362" description: "arXiv preprint" - title: "ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature" + journal: "Digital Discovery" + publisher: + name: "RSC" + status: advance-online + title: "ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature" type: article - url: "https://arxiv.org/abs/2510.20362" + url: "https://doi.org/10.1039/D5DD00521C" repository-code: "https://github.com/slimeslab/ComProScanner" license: MIT title: "ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature" type: software url: "https://slimeslab.github.io/ComProScanner/" -version: "0.1.4" -date-released: 2025-12-03 +version: "0.1.6" +date-released: 2026-04-02 keywords: - materials science - data extraction diff --git a/README.md b/README.md index 2d5a4e05..b1609a5a 100644 --- a/README.md +++ b/README.md @@ -169,14 +169,15 @@ eval_visualizer.plot_multiple_radar_charts( If you use ComProScanner in your research, please cite: ```bibtex -@misc{roy2025comproscannermultiagentbasedframework, - title={ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature}, - author={Aritra Roy and Enrico Grisan and John Buckeridge and Chiara Gattinoni}, - year={2025}, - eprint={2510.20362}, - archivePrefix={arXiv}, - primaryClass={physics.comp-ph}, - url={https://arxiv.org/abs/2510.20362}, +@Article{roy2026comproscannermultiagentbasedframework, +author ="Roy, Aritra and Grisan, 
Enrico and Buckeridge, John and Gattinoni, Chiara", +title ="ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature", +journal ="Digital Discovery", +year ="2026", +pages ="Accepted", +publisher ="RSC", +doi ="10.1039/D5DD00521C", +url ="https://doi.org/10.1039/D5DD00521C" } ``` diff --git a/docs/about/changelog.md b/docs/about/changelog.md index eb6852fa..0770dc03 100644 --- a/docs/about/changelog.md +++ b/docs/about/changelog.md @@ -1,6 +1,16 @@ -## Unreleased +## [0.1.6] - 02-04-2026 +### Changed +- Updated [README.md](README.md), [CITATION.cff](CITATION.cff) and docs with the published version (advance article) of the ComProScanner paper in _Digital Discovery_ as fully open access: + - [ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature](https://doi.org/10.1039/D5DD00521C) + +### Added +- Guide for API key creation for various LLM providers and publisher APIs added to the documentation at `docs/getting-started/api-key-guide.md` with detailed instructions for each provider. + +--- +## [0.1.5] - 08-02-2026 ### Added +- Data related to comparison with other agentic data extraction frameworks added for the ComProScanner paper in the `examples/piezo_test/comparing_existing_frameworks` folder. - New parameter `apply_advanced_cleaning` added to data cleaning methods in `data_cleaner.py`. When set to `True`, it triggers the advanced cleaning pipeline. @@ -38,6 +48,11 @@ - [CITATION.cff](https://github.com/slimeslab/ComProScanner/blob/main/CITATION.cff) added for standardized citation information based on the latest release and arXiv preprint. ### Fixed +- OAWorks API is replaced with OpenAlex API as OAWorks is no longer available. + +- Empty/corrupted PDF handled in `pdf_processor.py` and `wiley_processor.py` to avoid having GLYPH errors during text extraction. 
+ +- Data extraction failures fixed if composition-property text data is empty. - CSV progress tracking in `elsevier_processor.py`: @@ -63,6 +78,7 @@ - README badges section converted from HTML to markdown format for better compatibility across platforms. +--- ## [0.1.4] - 02-12-2025 ### Added @@ -96,6 +112,7 @@ - README images updated with raw GitHub links for better reliability: [ComProScanner Logo](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/comproscanner_logo.png), [ComProScanner Workflow](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/overall_workflow.png) +--- ## [0.1.3] - 04-11-2025 ### Fixed @@ -104,18 +121,21 @@ - Changed from `from langchain.text_splitter import RecursiveCharacterTextSplitter` - To `from langchain.text_splitter.recursive_character import RecursiveCharacterTextSplitter` +--- ## [0.1.2] - 24-10-2025 ### Added - Link to ComProScanner preprint on arXiv in the documentation index page and README.md: [arXiv:2510.20362](https://arxiv.org/abs/2510.20362) +--- ## [0.1.1] - 22-10-2025 ### Fixed - README images updated with external image link to fix PyPI rendering issue. 
[ComProScanner Logo](https://i.ibb.co/whHSbGvT/comproscanner-logo.png), [ComProScanner Workflow](https://i.ibb.co/QWd2qd3/overall-workflow.png) +--- ## [0.1.0] - 22-10-2025 ### Added diff --git a/docs/about/citation.md b/docs/about/citation.md index b0d3be68..977b3882 100644 --- a/docs/about/citation.md +++ b/docs/about/citation.md @@ -3,13 +3,14 @@ If you use ComProScanner in your research, please cite our related paper: ```bibtex -@misc{roy2025comproscannermultiagentbasedframework, - title={ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature}, - author={Aritra Roy and Enrico Grisan and John Buckeridge and Chiara Gattinoni}, - year={2025}, - eprint={2510.20362}, - archivePrefix={arXiv}, - primaryClass={physics.comp-ph}, - url={https://arxiv.org/abs/2510.20362}, +@Article{roy2026comproscannermultiagentbasedframework, +author ="Roy, Aritra and Grisan, Enrico and Buckeridge, John and Gattinoni, Chiara", +title ="ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature", +journal ="Digital Discovery", +year ="2026", +pages ="Accepted", +publisher ="RSC", +doi ="10.1039/D5DD00521C", +url ="https://doi.org/10.1039/D5DD00521C" } ``` diff --git a/docs/getting-started/api-key-guide.md b/docs/getting-started/api-key-guide.md new file mode 100644 index 00000000..0cb974ba --- /dev/null +++ b/docs/getting-started/api-key-guide.md @@ -0,0 +1,322 @@ +# API Key Guide + +This page explains which provider credentials ComProScanner can use, what each one is used for, and how to generate or obtain them. + +## Overview + +ComProScanner can work with three groups of external providers: + +!!! important "Which credentials do you actually need?" + + | Provider group | Requirement level | + | --- | --- | + | **Publisher/content providers for article access** | **Optional** for manual or local workflows, but **required** for automated article retrieval. 
| + | **LLM providers for extraction, vision models and RAG** | **At least one required** for extraction, vision models and RAG workflows. However, default models are different for extraction/RAG and vision-language models. | + | **Default embedding provider for vector database creation** | **Required** unless you configure a custom embedding provider. | + +Use only the providers relevant to your workflow. You do not need every key. + +## Publisher Providers + +### Elsevier / Scopus + +Environment variable: `SCOPUS_API_KEY` + +Used for: + +- Scopus-based metadata retrieval +- Elsevier article retrieval in XML format + +How to get it: + +1. Create or sign in to your [Elsevier developer account](https://dev.elsevier.com/). +2. Open the [API key management area](https://dev.elsevier.com/apikey/manage). +3. Create a key for Scopus or content APIs. +4. Copy the generated key into your `.env` file as `SCOPUS_API_KEY`. + +```bash +SCOPUS_API_KEY=your_scopus_api_key +``` + +### Springer Nature Open Access API + +Environment variable: `SPRINGER_OPENACCESS_API_KEY` + +Used for: + +- Springer Open Access article retrieval in XML format + +How to get it: + +1. Create or sign in to your [Springer Nature account](https://dev.springernature.com/). +2. Fill out the form to request an Open Access API key at [https://dev.springernature.com/register/](https://dev.springernature.com/register/). +3. Get the Open Access API key from the [Springer Nature API management page](https://datasolutions.springernature.com/account/api-management/). +4. Copy the key into your `.env` file. + +```bash +SPRINGER_OPENACCESS_API_KEY=your_springer_openaccess_api_key +``` + +### Springer Nature TDM API + +Environment variable: `SPRINGER_TDM_API_KEY` + +Used for: + +- Springer subscription article retrieval in XML format + +How to get it: + +1.
Subscribe to the Springer Nature TDM service via [https://dev.springernature.com/subscription/](https://dev.springernature.com/subscription/) and select the appropriate access level based on your institution and use case. +2. Copy the issued TDM key or token into your `.env` file. + +```bash +SPRINGER_TDM_API_KEY=your_springer_tdm_api_key +``` + +### Wiley TDM API + +Environment variable: `WILEY_API_KEY` + +Used for: + +- Wiley full-text article download as PDF + +How to get it: + +1. Create your [Wiley account](https://onlinelibrary.wiley.com/action/registration). +2. Log in to your Wiley account at [https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) and go to the "**Get a Text and Data Mining Token**" section. +3. Accept the terms and conditions to generate your API token. +4. Copy the API token into your `.env` file. + +```bash +WILEY_API_KEY=your_wiley_api_key +``` + +### IOP Publishing + +Environment variable: `IOP_papers_path` (*not an API key but a required path variable for processing IOP Science XML files*) + +Used for: + +- Local processing of IOP Science XML files downloaded in bulk + +How to get it: + +1. Email [contentsupport@ioppublishing.org](mailto:contentsupport@ioppublishing.org) to request bulk access to the IOP Science XML files, typically through SFTP, as IOP Publishing does not provide direct API access for bulk downloads. +2. Once you have access, download the XML files to a local directory. +3. Set `IOP_papers_path` to the absolute local folder path containing all the downloaded files. + +```bash +IOP_papers_path=/absolute/path/to/iop_papers +``` + +## LLM Providers + +These providers can be used for extraction models, RAG chat models, and vision-language models where supported by your configuration. + +### OpenAI + +Environment variable: `OPENAI_API_KEY` + +Typical model prefixes: `openai/...` or OpenAI model names directly + +How to get it: + +1.
Create or sign in to your [OpenAI account](https://platform.openai.com/). +2. Open the [API keys section](https://platform.openai.com/api-keys). +3. Create a new secret key. +4. Store it in `.env`. + +```bash +OPENAI_API_KEY=your_openai_api_key +``` + +### Google Gemini + +Environment variable: `GEMINI_API_KEY` + +Typical model prefixes: `gemini/...` + +How to get it: + +1. Create or sign in to your [Google AI Studio account](https://aistudio.google.com/). +2. Generate an API key from the [Gemini API key page](https://aistudio.google.com/app/apikey). +3. Store it in `.env` as `GEMINI_API_KEY`. + +```bash +GEMINI_API_KEY=your_gemini_api_key +``` + +### Anthropic + +Environment variable: `ANTHROPIC_API_KEY` + +Typical model prefixes: `anthropic/...` + +How to get it: + +1. Create or sign in to your [Anthropic Console account](https://console.anthropic.com/). +2. Create a new API key from the [Anthropic keys page](https://console.anthropic.com/settings/keys). +3. Store it in `.env`. + +```bash +ANTHROPIC_API_KEY=your_anthropic_api_key +``` + +### DeepSeek + +Environment variable: `DEEPSEEK_API_KEY` + +Typical model prefixes: `deepseek/...` + +How to get it: + +1. Create or sign in to your [DeepSeek platform account](https://platform.deepseek.com/). +2. Generate an API key from the [DeepSeek API keys page](https://platform.deepseek.com/api_keys). +3. Store it in `.env`. + +```bash +DEEPSEEK_API_KEY=your_deepseek_api_key +``` + +### OpenRouter + +Environment variable: `OPENROUTER_API_KEY` + +Typical model prefixes: `openrouter/...` + +How to get it: + +1. Create or sign in to your [OpenRouter account](https://openrouter.ai/). +2. Generate an API key from the [OpenRouter keys page](https://openrouter.ai/keys). +3. Store it in `.env`. + +```bash +OPENROUTER_API_KEY=your_openrouter_api_key +``` + +### Together AI + +Environment variable: `TOGETHER_API_KEY` + +Typical model prefixes: `together_ai/...` + +How to get it: + +1. 
Create or sign in to your [Together AI account](https://www.together.ai/). +2. Generate an API key from the [Together AI API keys page](https://api.together.ai/settings/api-keys). +3. Store it in `.env`. + +```bash +TOGETHER_API_KEY=your_together_api_key +``` + +### Cohere + +Environment variable: `COHERE_API_KEY` + +Typical model prefixes: `cohere/...` + +How to get it: + +1. Create or sign in to your [Cohere account](https://dashboard.cohere.com/). +2. Create an API key from the [Cohere API keys page](https://dashboard.cohere.com/api-keys). +3. Store it in `.env`. + +```bash +COHERE_API_KEY=your_cohere_api_key +``` + +### Fireworks AI + +Environment variable: `FIREWORKS_API_KEY` + +Typical model prefixes: `fireworks_ai/...` + +How to get it: + +1. Create or sign in to your [Fireworks AI account](https://fireworks.ai/). +2. Generate an API key from the [Fireworks AI API keys page](https://app.fireworks.ai/settings/users/api-keys). +3. Store it in `.env`. + +```bash +FIREWORKS_API_KEY=your_fireworks_api_key +``` + +### Ollama + +Environment variable: none required + +Used for: + +- Local model inference + +How to set it up: + +1. Install Ollama from the [main Ollama website](https://ollama.com/). +2. Pull the model you want to use by following the [Ollama library and setup docs](https://ollama.com/library). +3. Set `base_url` or `rag_base_url` if needed, such as `http://localhost:11434`. + +## Default Embedding Provider + +### Hugging Face + +Environment variable: `HF_TOKEN` + +Used for: + +- Accessing the default Hugging Face embedding model workflow +- Accessing gated or rate-limited Hugging Face models +- Optional embedding/model downloads when required + +How to get it: + +1. Create or sign in to your [Hugging Face account](https://huggingface.co/). +2. Open the [access tokens page](https://huggingface.co/settings/tokens). +3. Create a new token with the required permissions. +4. Store it in `.env`. 
+ +```bash +HF_TOKEN=your_huggingface_token +``` + +## Recommended `.env` Template + +Use the subset you need: + +```bash +# Publisher providers +SCOPUS_API_KEY=your_scopus_api_key +SPRINGER_OPENACCESS_API_KEY=your_springer_openaccess_api_key +SPRINGER_TDM_API_KEY=your_springer_tdm_api_key +WILEY_API_KEY=your_wiley_api_key +IOP_papers_path=/absolute/path/to/iop_papers + +# LLM providers +OPENAI_API_KEY=your_openai_api_key +GEMINI_API_KEY=your_gemini_api_key +ANTHROPIC_API_KEY=your_anthropic_api_key +DEEPSEEK_API_KEY=your_deepseek_api_key +OPENROUTER_API_KEY=your_openrouter_api_key +TOGETHER_API_KEY=your_together_api_key +COHERE_API_KEY=your_cohere_api_key +FIREWORKS_API_KEY=your_fireworks_api_key + +# Model and embedding access +HF_TOKEN=your_huggingface_token +``` + +## Notes + +- Keep all keys in your local `.env` file and never commit them to version control. +- For most users, the minimum setup is one publisher source plus one LLM provider. +- If you use Gemini models, use `GEMINI_API_KEY`. +- If you use the default embedding setup, make sure `HF_TOKEN` is available. 
+ +## Related Pages + +- [Installation](installation.md) +- [Article Processing](../usage/article-processing.md) +- [Data Extraction](../usage/data-extraction.md) +- [RAG Configuration](../rag-config.md) From 5516bd792860139c8bff420d8b2196d0da40d25c Mon Sep 17 00:00:00 2001 From: aritraroy24 Date: Thu, 2 Apr 2026 10:26:04 +0100 Subject: [PATCH 2/3] chore: bump version to 0.1.6 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3c6085be..3c07ded7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "comproscanner" -version = "0.1.5" +version = "0.1.6" description = "Multi-agent system for extracting and processing structured composition-property data from scientific literature" readme = "README.md" authors = [{ name = "Aritra Roy", email = "contact@aritraroy.live" }] From 2df5f0a2d8d7318726e6bed80cad034dc3763ba0 Mon Sep 17 00:00:00 2001 From: aritraroy24 Date: Thu, 2 Apr 2026 11:11:17 +0100 Subject: [PATCH 3/3] fix: standardize model prefixes in rag_tool and update changelog --- .gitignore | 4 ---- CHANGELOG.md | 18 +++++++++++------- docs/about/changelog.md | 14 +++++++------- docs/getting-started/api-key-guide.md | 11 ++++++----- docs/rag-config.md | 6 +++--- .../extract_flow/tools/rag_tool.py | 10 +++++----- 6 files changed, 32 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index d3db6400..b031c1a1 100644 --- a/.gitignore +++ b/.gitignore @@ -186,10 +186,6 @@ applications vlm_test examples/vlm_piezo_test -# Claude files -CLAUDE.md -.claude - # Test results db results diff --git a/CHANGELOG.md b/CHANGELOG.md index 23b298d0..3695ada2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,7 +24,7 @@ - `process_articles()` now routes user-provided `doi_list` by `general_publisher` from metadata and sends each DOI only to its matching source processor. 
--- -## [0.1.6] - 02-04-2026 +## [0.1.6] - 2026-04-02 ### Changed - Updated [README.md](README.md), [CITATION.cff](CITATION.cff) and docs with the published version (advance article) of the ComProScanner paper in _Digital Discovery_ as fully open access: - [ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature](https://doi.org/10.1039/D5DD00521C) @@ -32,8 +32,12 @@ ### Added - Guide for API key creation for various LLM providers and publisher APIs added to the documentation at `docs/getting-started/api-key-guide.md` with detailed instructions for each provider. +### Fixed +- Model prefix handling in `rag_tool.py` standardized to reflect the docs. +- `HF_TOKEN` documentation clarified as optional — only required for gated or private Hugging Face models. + --- -## [0.1.5] - 08-02-2026 +## [0.1.5] - 2026-02-08 ### Added - Data related to comparison with other agentic data extraction frameworks added for the ComProScanner paper in the `examples/piezo_test/comparing_existing_frameworks` folder. @@ -105,7 +109,7 @@ - README badges section converted from HTML to markdown format for better compatibility across platforms. 
--- -## [0.1.4] - 02-12-2025 +## [0.1.4] - 2025-12-02 ### Added @@ -141,7 +145,7 @@ - [ComProScanner Workflow](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/overall_workflow.png) --- -## [0.1.3] - 04-11-2025 +## [0.1.3] - 2025-11-04 ### Fixed @@ -150,7 +154,7 @@ - To `from langchain.text_splitter.recursive_character import RecursiveCharacterTextSplitter` --- -## [0.1.2] - 24-10-2025 +## [0.1.2] - 2025-10-24 ### Added @@ -158,7 +162,7 @@ - [arXiv:2510.20362](https://arxiv.org/abs/2510.20362) --- -## [0.1.1] - 22-10-2025 +## [0.1.1] - 2025-10-22 ### Fixed @@ -167,7 +171,7 @@ - [ComProScanner Workflow](https://i.ibb.co/QWd2qd3/overall-workflow.png) --- -## [0.1.0] - 22-10-2025 +## [0.1.0] - 2025-10-22 ### Added diff --git a/docs/about/changelog.md b/docs/about/changelog.md index 23b298d0..438b8421 100644 --- a/docs/about/changelog.md +++ b/docs/about/changelog.md @@ -24,7 +24,7 @@ - `process_articles()` now routes user-provided `doi_list` by `general_publisher` from metadata and sends each DOI only to its matching source processor. --- -## [0.1.6] - 02-04-2026 +## [0.1.6] - 2026-04-02 ### Changed - Updated [README.md](README.md), [CITATION.cff](CITATION.cff) and docs with the published version (advance article) of the ComProScanner paper in _Digital Discovery_ as fully open access: - [ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature](https://doi.org/10.1039/D5DD00521C) @@ -33,7 +33,7 @@ - Guide for API key creation for various LLM providers and publisher APIs added to the documentation at `docs/getting-started/api-key-guide.md` with detailed instructions for each provider. --- -## [0.1.5] - 08-02-2026 +## [0.1.5] - 2026-02-08 ### Added - Data related to comparison with other agentic data extraction frameworks added for the ComProScanner paper in the `examples/piezo_test/comparing_existing_frameworks` folder. 
@@ -105,7 +105,7 @@ - README badges section converted from HTML to markdown format for better compatibility across platforms. --- -## [0.1.4] - 02-12-2025 +## [0.1.4] - 2025-12-02 ### Added @@ -141,7 +141,7 @@ - [ComProScanner Workflow](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/overall_workflow.png) --- -## [0.1.3] - 04-11-2025 +## [0.1.3] - 2025-11-04 ### Fixed @@ -150,7 +150,7 @@ - To `from langchain.text_splitter.recursive_character import RecursiveCharacterTextSplitter` --- -## [0.1.2] - 24-10-2025 +## [0.1.2] - 2025-10-24 ### Added @@ -158,7 +158,7 @@ - [arXiv:2510.20362](https://arxiv.org/abs/2510.20362) --- -## [0.1.1] - 22-10-2025 +## [0.1.1] - 2025-10-22 ### Fixed @@ -167,7 +167,7 @@ - [ComProScanner Workflow](https://i.ibb.co/QWd2qd3/overall-workflow.png) --- -## [0.1.0] - 22-10-2025 +## [0.1.0] - 2025-10-22 ### Added diff --git a/docs/getting-started/api-key-guide.md b/docs/getting-started/api-key-guide.md index 0cb974ba..81645188 100644 --- a/docs/getting-started/api-key-guide.md +++ b/docs/getting-started/api-key-guide.md @@ -200,7 +200,7 @@ OPENROUTER_API_KEY=your_openrouter_api_key Environment variable: `TOGETHER_API_KEY` -Typical model prefixes: `together_ai/...` +Typical model prefixes: `together/...` How to get it: @@ -232,7 +232,7 @@ COHERE_API_KEY=your_cohere_api_key Environment variable: `FIREWORKS_API_KEY` -Typical model prefixes: `fireworks_ai/...` +Typical model prefixes: `fireworks/...` How to get it: @@ -264,11 +264,12 @@ How to set it up: Environment variable: `HF_TOKEN` +> **Optional.** Only required for downloading gated or private Hugging Face models. Public models work without a token. 
+ Used for: -- Accessing the default Hugging Face embedding model workflow -- Accessing gated or rate-limited Hugging Face models -- Optional embedding/model downloads when required +- Accessing gated or private Hugging Face models +- Rate-limited API access How to get it: diff --git a/docs/rag-config.md b/docs/rag-config.md index 9b843481..52376775 100644 --- a/docs/rag-config.md +++ b/docs/rag-config.md @@ -123,7 +123,7 @@ scanner.extract_composition_property_data( scanner.extract_composition_property_data( main_extraction_keyword="d33", rag_db_path="embeddings/piezo", - rag_chat_model="deepseek-chat", + rag_chat_model="deepseek/deepseek-chat", rag_max_tokens=1024, rag_top_k=4, ) @@ -178,7 +178,7 @@ scanner.extract_composition_property_data( scanner.extract_composition_property_data( main_extraction_keyword="d33", rag_db_path="embeddings/piezo", - rag_chat_model="together_ai/meta-llama/Llama-3-70b-chat-hf", + rag_chat_model="together/meta-llama/Llama-3-70b-chat-hf", rag_max_tokens=1024, rag_top_k=4, ) @@ -220,7 +220,7 @@ scanner.extract_composition_property_data( scanner.extract_composition_property_data( main_extraction_keyword="d33", rag_db_path="embeddings/piezo", - rag_chat_model="fireworks_ai/accounts/fireworks/models/llama-v3-8b-instruct", + rag_chat_model="fireworks/models/llama-v3-8b-instruct", rag_max_tokens=1024, rag_top_k=4, ) diff --git a/src/comproscanner/extract_flow/tools/rag_tool.py b/src/comproscanner/extract_flow/tools/rag_tool.py index deff690d..b8ee8220 100644 --- a/src/comproscanner/extract_flow/tools/rag_tool.py +++ b/src/comproscanner/extract_flow/tools/rag_tool.py @@ -82,28 +82,28 @@ def _get_llm(self) -> BaseChatModel: "callbacks": callbacks, } # OpenAI models - if model.startswith(("gpt-", "text-", "o1", "o3")): + if model.startswith(("openai/", "gpt-", "text-", "o1", "o3")): self._check_package_exists("langchain_openai", model) from langchain_openai import ChatOpenAI return ChatOpenAI(model=model, request_timeout=1000, **common_params) 
# Deepseek models - if model.startswith("deepseek"): + if model.startswith("deepseek/"): self._check_package_exists("langchain_deepseek", model) from langchain_deepseek import ChatDeepSeek return ChatDeepSeek(model=model, request_timeout=1000, **common_params) # Google Gemini models - elif model.startswith("gemini-"): + elif model.startswith("gemini/"): self._check_package_exists("langchain_google_genai", model) from langchain_google_genai import ChatGoogleGenerativeAI return ChatGoogleGenerativeAI(model=model, **common_params) # Anthropic Claude models - elif model.startswith("claude-"): + elif model.startswith("claude/"): self._check_package_exists("langchain_anthropic", model) from langchain_anthropic import ChatAnthropic @@ -143,7 +143,7 @@ def _get_llm(self) -> BaseChatModel: return ChatCohere(model=model_name, **common_params) # Fireworks models - elif model.startswith(("fireworks/", "accounts/fireworks")): + elif model.startswith(("fireworks/")): self._check_package_exists("langchain_fireworks", model) from langchain_fireworks import ChatFireworks