diff --git a/CHANGELOG.md b/CHANGELOG.md index b1a45326..3695ada2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,4 @@ -## [Unreleased] - -### Added - +# Unreleased - New `value_error_thresholds` parameter added to both `evaluate_semantic()` and `evaluate_agentic()` for range-based absolute error tolerances on numeric property value comparisons: - Accepts a dict mapping `(min, max)` tuples to absolute error thresholds. When a ground-truth value falls inside a range, the extracted value is accepted if `|extracted - ground_truth| ≤ threshold`. Values outside all configured ranges fall back to exact comparison. @@ -26,7 +23,21 @@ - `process_articles()` now routes user-provided `doi_list` by `general_publisher` from metadata and sends each DOI only to its matching source processor. -## [0.1.5] - 08-02-2026 +--- +## [0.1.6] - 2026-04-02 +### Changed +- Updated [README.md](README.md), [CITATION.cff](CITATION.cff) and docs with the published version (advance article) of the ComProScanner paper in _Digital Discovery_ as fully open access: + - [ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature](https://doi.org/10.1039/D5DD00521C) + +### Added +- Guide for API key creation for various LLM providers and publisher APIs added to the documentation at `docs/getting-started/api-key-guide.md` with detailed instructions for each provider. + +### Fixed +- Model prefix handling in `rag_tool.py` standardized to reflect the docs. +- `HF_TOKEN` documentation clarified as optional — only required for gated or private Hugging Face models. + +--- +## [0.1.5] - 2026-02-08 ### Added - Data related to comparison with other agentic data extraction frameworks added for the ComProScanner paper in the `examples/piezo_test/comparing_existing_frameworks` folder. @@ -97,7 +108,8 @@ - README badges section converted from HTML to markdown format for better compatibility across platforms. 
-## [0.1.4] - 02-12-2025 +--- +## [0.1.4] - 2025-12-02 ### Added @@ -132,7 +144,8 @@ - [ComProScanner Logo](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/comproscanner_logo.png) - [ComProScanner Workflow](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/overall_workflow.png) -## [0.1.3] - 04-11-2025 +--- +## [0.1.3] - 2025-11-04 ### Fixed @@ -140,14 +153,16 @@ - Changed from `from langchain.text_splitter import RecursiveCharacterTextSplitter` - To `from langchain.text_splitter.recursive_character import RecursiveCharacterTextSplitter` -## [0.1.2] - 24-10-2025 +--- +## [0.1.2] - 2025-10-24 ### Added - Link to ComProScanner preprint on arXiv in the documentation index page and README.md: - [arXiv:2510.20362](https://arxiv.org/abs/2510.20362) -## [0.1.1] - 22-10-2025 +--- +## [0.1.1] - 2025-10-22 ### Fixed @@ -155,7 +170,8 @@ - [ComProScanner Logo](https://i.ibb.co/whHSbGvT/comproscanner-logo.png) - [ComProScanner Workflow](https://i.ibb.co/QWd2qd3/overall-workflow.png) -## [0.1.0] - 22-10-2025 +--- +## [0.1.0] - 2025-10-22 ### Added diff --git a/CITATION.cff b/CITATION.cff index 723c2600..56b4a26c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -16,7 +16,7 @@ contact: - family-names: Roy given-names: Aritra orcid: "https://orcid.org/0000-0002-4928-2935" -message: If you use this software, please cite our article on arXiv. +message: If you use this software, please cite our article in Digital Discovery. 
preferred-citation: authors: - family-names: Roy @@ -31,21 +31,28 @@ preferred-citation: - family-names: Gattinoni given-names: Chiara orcid: "https://orcid.org/0000-0002-3376-6374" - date-published: 2025-10-23 + doi: "10.1039/D5DD00521C" identifiers: + - type: doi + value: "10.1039/D5DD00521C" + description: "Peer-reviewed article" - type: other value: "arXiv:2510.20362" description: "arXiv preprint" - title: "ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature" + journal: "Digital Discovery" + publisher: + name: "RSC" + status: advance-online + title: "ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature" type: article - url: "https://arxiv.org/abs/2510.20362" + url: "https://doi.org/10.1039/D5DD00521C" repository-code: "https://github.com/slimeslab/ComProScanner" license: MIT title: "ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature" type: software url: "https://slimeslab.github.io/ComProScanner/" -version: "0.1.4" -date-released: 2025-12-03 +version: "0.1.6" +date-released: 2026-04-02 keywords: - materials science - data extraction diff --git a/README.md b/README.md index 2d5a4e05..b1609a5a 100644 --- a/README.md +++ b/README.md @@ -169,14 +169,15 @@ eval_visualizer.plot_multiple_radar_charts( If you use ComProScanner in your research, please cite: ```bibtex -@misc{roy2025comproscannermultiagentbasedframework, - title={ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature}, - author={Aritra Roy and Enrico Grisan and John Buckeridge and Chiara Gattinoni}, - year={2025}, - eprint={2510.20362}, - archivePrefix={arXiv}, - primaryClass={physics.comp-ph}, - url={https://arxiv.org/abs/2510.20362}, +@Article{roy2026comproscannermultiagentbasedframework, +author ="Roy, Aritra and Grisan, 
Enrico and Buckeridge, John and Gattinoni, Chiara", +title ="ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature", +journal ="Digital Discovery", +year ="2026", +pages ="Accepted", +publisher ="RSC", +doi ="10.1039/D5DD00521C", +url ="https://doi.org/10.1039/D5DD00521C" } ``` diff --git a/docs/about/changelog.md b/docs/about/changelog.md index 6c05e173..438b8421 100644 --- a/docs/about/changelog.md +++ b/docs/about/changelog.md @@ -1,6 +1,42 @@ -## Unreleased +# Unreleased +- New `value_error_thresholds` parameter added to both `evaluate_semantic()` and `evaluate_agentic()` for range-based absolute error tolerances on numeric property value comparisons: + + - Accepts a dict mapping `(min, max)` tuples to absolute error thresholds. When a ground-truth value falls inside a range, the extracted value is accepted if `|extracted - ground_truth| ≤ threshold`. Values outside all configured ranges fall back to exact comparison. + + - **Semantic evaluation**: handled inside `_is_value_in_range()` via the new `_get_error_threshold()` helper in `MaterialsDataSemanticEvaluator`. + + - **Agentic evaluation**: a new `GetValueErrorThresholdTool` (CrewAI `BaseTool`) is added to the composition evaluator agent when thresholds are configured. The agent calls this tool with the reference value to retrieve the tolerance before deciding on each numeric match. No tool is added and no prompt changes are made when no thresholds are provided. + +- Exposed `value_error_thresholds` in public evaluation methods: `ComProScanner.evaluate_semantic()`, `ComProScanner.evaluate_agentic()`, `comproscanner.evaluate_semantic()`, and `comproscanner.evaluate_agentic()`. 
+ +- VLM-based graph data extraction added across all publishers and PDF processors: + + - New `GraphExtractorTool` — a CrewAI agent tool that reads saved figures for a given DOI and uses a vision LLM to extract composition-property value pairs from graphs and charts. Default VLM: `gemini/gemini-3-flash-preview`. + + - New `FigureExtractor` utility — shared helper for caption keyword-based figure filtering and saving, used by all article processors. + + - New `caption_keywords` parameter in `process_articles()` and `extract_composition_property_data()`, and new `vlm_model` and `related_figures_base_path` parameters in `extract_composition_property_data()`. + +- New unit tests added for all three agent tools in `tests/test_agent_tools/`. + +### Fixed + +- `process_articles()` now routes user-provided `doi_list` by `general_publisher` from metadata and sends each DOI only to its matching source processor. + +--- +## [0.1.6] - 2026-04-02 +### Changed +- Updated [README.md](README.md), [CITATION.cff](CITATION.cff) and docs with the published version (advance article) of the ComProScanner paper in _Digital Discovery_ as fully open access: + - [ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature](https://doi.org/10.1039/D5DD00521C) ### Added +- Guide for API key creation for various LLM providers and publisher APIs added to the documentation at `docs/getting-started/api-key-guide.md` with detailed instructions for each provider. + +--- +## [0.1.5] - 2026-02-08 + +### Added +- Data related to comparison with other agentic data extraction frameworks added for the ComProScanner paper in the `examples/piezo_test/comparing_existing_frameworks` folder. - New parameter `apply_advanced_cleaning` added to data cleaning methods in `data_cleaner.py`. When set to `True`, it triggers the advanced cleaning pipeline. 
@@ -37,9 +73,12 @@ - [CITATION.cff](https://github.com/slimeslab/ComProScanner/blob/main/CITATION.cff) added for standardized citation information based on the latest release and arXiv preprint. -- Exposed `value_error_thresholds` in public evaluation methods: `ComProScanner.evaluate_semantic()`, `ComProScanner.evaluate_agentic()`, `comproscanner.evaluate_semantic()`, and `comproscanner.evaluate_agentic()`. - ### Fixed +- OAWorks API is replaced with OpenAlex API as OAWorks is no longer available. + +- Empty/corrupted PDF handled in `pdf_processor.py` and `wiley_processor.py` to avoid having GLYPH errors during text extraction. + +- Data extraction failures fixed if composition-property text data is empty. - CSV progress tracking in `elsevier_processor.py`: @@ -61,13 +100,12 @@ - GitHub Actions CI disk space issue: - Added `--no-cache-dir` flag to pip install to reduce disk usage -- `process_articles()` now routes user-provided `doi_list` by `general_publisher` from metadata and sends each DOI only to its matching source processor. - ### Changed - README badges section converted from HTML to markdown format for better compatibility across platforms. 
-## [0.1.4] - 02-12-2025 +--- +## [0.1.4] - 2025-12-02 ### Added @@ -98,9 +136,12 @@ ### Changed -- README images updated with raw GitHub links for better reliability: [ComProScanner Logo](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/comproscanner_logo.png), [ComProScanner Workflow](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/overall_workflow.png) +- README images updated with raw GitHub links for better reliability: + - [ComProScanner Logo](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/comproscanner_logo.png) + - [ComProScanner Workflow](https://raw.githubusercontent.com/aritraroy24/ComProScanner/main/assets/overall_workflow.png) -## [0.1.3] - 04-11-2025 +--- +## [0.1.3] - 2025-11-04 ### Fixed @@ -108,22 +149,26 @@ - Changed from `from langchain.text_splitter import RecursiveCharacterTextSplitter` - To `from langchain.text_splitter.recursive_character import RecursiveCharacterTextSplitter` -## [0.1.2] - 24-10-2025 +--- +## [0.1.2] - 2025-10-24 ### Added -- Link to ComProScanner preprint on arXiv in the documentation index page and README.md: [arXiv:2510.20362](https://arxiv.org/abs/2510.20362) +- Link to ComProScanner preprint on arXiv in the documentation index page and README.md: + - [arXiv:2510.20362](https://arxiv.org/abs/2510.20362) -## [0.1.1] - 22-10-2025 +--- +## [0.1.1] - 2025-10-22 ### Fixed -- README images updated with external image link to fix PyPI rendering issue. [ComProScanner Logo](https://i.ibb.co/whHSbGvT/comproscanner-logo.png), [ComProScanner Workflow](https://i.ibb.co/QWd2qd3/overall-workflow.png) +- README images updated with external image link to fix PyPI rendering issue. + - [ComProScanner Logo](https://i.ibb.co/whHSbGvT/comproscanner-logo.png) + - [ComProScanner Workflow](https://i.ibb.co/QWd2qd3/overall-workflow.png) -## [0.1.0] - 22-10-2025 +--- +## [0.1.0] - 2025-10-22 ### Added - Initial release of ComProScanner. 
- - diff --git a/docs/about/citation.md b/docs/about/citation.md index b0d3be68..977b3882 100644 --- a/docs/about/citation.md +++ b/docs/about/citation.md @@ -3,13 +3,14 @@ If you use ComProScanner in your research, please cite our related paper: ```bibtex -@misc{roy2025comproscannermultiagentbasedframework, - title={ComProScanner: A multi-agent based framework for composition-property structured data extraction from scientific literature}, - author={Aritra Roy and Enrico Grisan and John Buckeridge and Chiara Gattinoni}, - year={2025}, - eprint={2510.20362}, - archivePrefix={arXiv}, - primaryClass={physics.comp-ph}, - url={https://arxiv.org/abs/2510.20362}, +@Article{roy2026comproscannermultiagentbasedframework, +author ="Roy, Aritra and Grisan, Enrico and Buckeridge, John and Gattinoni, Chiara", +title ="ComProScanner: a multi-agent based framework for composition-property structured data extraction from scientific literature", +journal ="Digital Discovery", +year ="2026", +pages ="Accepted", +publisher ="RSC", +doi ="10.1039/D5DD00521C", +url ="https://doi.org/10.1039/D5DD00521C" } ``` diff --git a/docs/getting-started/api-key-guide.md b/docs/getting-started/api-key-guide.md new file mode 100644 index 00000000..81645188 --- /dev/null +++ b/docs/getting-started/api-key-guide.md @@ -0,0 +1,323 @@ +# API Key Guide + +This page explains which provider credentials ComProScanner can use, what each one is used for, and how to generate or obtain them. + +## Overview + +ComProScanner can work with three groups of external providers: + +!!! important "Which credentials do you actually need?" + + | Provider group | Requirement level | + | --- | --- | + | **Publisher/content providers for article access** | **Optional** for manual or local workflows, but **required** for automated article retrieval. | + | **LLM providers for extraction, vision models and RAG** | **At least one required** for extraction, vision models and RAG workflows. 
However, default models are different for extraction/RAG and vision-language models. | + | **Default embedding provider for vector database creation** | **Optional** — `HF_TOKEN` is only needed when the embedding model is gated or private; public models work without it. | + +Use only the providers relevant to your workflow. You do not need every key. + +## Publisher Providers + +### Elsevier / Scopus + +Environment variable: `SCOPUS_API_KEY` + +Used for: + +- Scopus-based metadata retrieval +- Elsevier article retrieval in XML format + +How to get it: + +1. Create or sign in to your [Elsevier developer account](https://dev.elsevier.com/). +2. Open the [API key management area](https://dev.elsevier.com/apikey/manage). +3. Create a key for Scopus or content APIs. +4. Copy the generated key into your `.env` file as `SCOPUS_API_KEY`. + +```bash +SCOPUS_API_KEY=your_scopus_api_key +``` + +### Springer Nature Open Access API + +Environment variable: `SPRINGER_OPENACCESS_API_KEY` + +Used for: + +- Springer Open Access article retrieval in XML format + +How to get it: + +1. Create or sign in to your [Springer Nature account](https://dev.springernature.com/). +2. Fill out the form to request an Open Access API key at [https://dev.springernature.com/register/](https://dev.springernature.com/register/). +3. Get the Open Access API key from the [Springer Nature API management page](https://datasolutions.springernature.com/account/api-management/). +4. Copy the key into your `.env` file. + +```bash +SPRINGER_OPENACCESS_API_KEY=your_springer_openaccess_api_key +``` + +### Springer Nature TDM API + +Environment variable: `SPRINGER_TDM_API_KEY` + +Used for: + +- Springer subscription article retrieval in XML format + +How to get it: + +1. Subscribe to the Springer Nature TDM service via [https://dev.springernature.com/subscription/](https://dev.springernature.com/subscription/) and select the appropriate access level based on your institution and use case. +2. Copy the issued TDM key or token into your `.env` file. 
+ +```bash +SPRINGER_TDM_API_KEY=your_springer_tdm_api_key +``` + +### Wiley TDM API + +Environment variable: `WILEY_API_KEY` + +Used for: + +- Wiley full-text article download as PDF + +How to get it: + +1. Create your [Wiley account](https://onlinelibrary.wiley.com/action/registration). +2. Log in to your Wiley account at [https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining](https://onlinelibrary.wiley.com/library-info/resources/text-and-datamining) and go to the "**Get a Text and Data Mining Token**" section. +3. Accept the terms and conditions to generate your API token. +4. Copy the API token into your `.env` file. + +```bash +WILEY_API_KEY=your_wiley_api_key +``` + +### IOP Publishing + +Environment variable: `IOP_papers_path` (*not an API key but a required path variable for processing IOP Science XML files*) + +Used for: + +- Local processing of IOP Science XML files downloaded in bulk + +How to get it: + +1. Email [contentsupport@ioppublishing.org](mailto:contentsupport@ioppublishing.org) to request bulk access to the IOP Science XML files, typically through SFTP as IOP Publishing does not provide direct API access for bulk downloads. +2. Once you have access, download the XML files to a local directory. +3. Set `IOP_papers_path` to the absolute local folder path containing all the downloaded files. + +```bash +IOP_papers_path=/absolute/path/to/iop_papers +``` + +## LLM Providers + +These providers can be used for extraction models, RAG chat models, and vision-language models where supported by your configuration. + +### OpenAI + +Environment variable: `OPENAI_API_KEY` + +Typical model prefixes: `openai/...` or OpenAI model names directly + +How to get it: + +1. Create or sign in to your [OpenAI account](https://platform.openai.com/). +2. Open the [API keys section](https://platform.openai.com/api-keys). +3. Create a new secret key. +4. Store it in `.env`. 
+ +```bash +OPENAI_API_KEY=your_openai_api_key +``` + +### Google Gemini + +Environment variable: `GEMINI_API_KEY` + +Typical model prefixes: `gemini/...` + +How to get it: + +1. Create or sign in to your [Google AI Studio account](https://aistudio.google.com/). +2. Generate an API key from the [Gemini API key page](https://aistudio.google.com/app/apikey). +3. Store it in `.env` as `GEMINI_API_KEY`. + +```bash +GEMINI_API_KEY=your_gemini_api_key +``` + +### Anthropic + +Environment variable: `ANTHROPIC_API_KEY` + +Typical model prefixes: `anthropic/...` + +How to get it: + +1. Create or sign in to your [Anthropic Console account](https://console.anthropic.com/). +2. Create a new API key from the [Anthropic keys page](https://console.anthropic.com/settings/keys). +3. Store it in `.env`. + +```bash +ANTHROPIC_API_KEY=your_anthropic_api_key +``` + +### DeepSeek + +Environment variable: `DEEPSEEK_API_KEY` + +Typical model prefixes: `deepseek/...` + +How to get it: + +1. Create or sign in to your [DeepSeek platform account](https://platform.deepseek.com/). +2. Generate an API key from the [DeepSeek API keys page](https://platform.deepseek.com/api_keys). +3. Store it in `.env`. + +```bash +DEEPSEEK_API_KEY=your_deepseek_api_key +``` + +### OpenRouter + +Environment variable: `OPENROUTER_API_KEY` + +Typical model prefixes: `openrouter/...` + +How to get it: + +1. Create or sign in to your [OpenRouter account](https://openrouter.ai/). +2. Generate an API key from the [OpenRouter keys page](https://openrouter.ai/keys). +3. Store it in `.env`. + +```bash +OPENROUTER_API_KEY=your_openrouter_api_key +``` + +### Together AI + +Environment variable: `TOGETHER_API_KEY` + +Typical model prefixes: `together/...` + +How to get it: + +1. Create or sign in to your [Together AI account](https://www.together.ai/). +2. Generate an API key from the [Together AI API keys page](https://api.together.ai/settings/api-keys). +3. Store it in `.env`. 
+ +```bash +TOGETHER_API_KEY=your_together_api_key +``` + +### Cohere + +Environment variable: `COHERE_API_KEY` + +Typical model prefixes: `cohere/...` + +How to get it: + +1. Create or sign in to your [Cohere account](https://dashboard.cohere.com/). +2. Create an API key from the [Cohere API keys page](https://dashboard.cohere.com/api-keys). +3. Store it in `.env`. + +```bash +COHERE_API_KEY=your_cohere_api_key +``` + +### Fireworks AI + +Environment variable: `FIREWORKS_API_KEY` + +Typical model prefixes: `fireworks/...` + +How to get it: + +1. Create or sign in to your [Fireworks AI account](https://fireworks.ai/). +2. Generate an API key from the [Fireworks AI API keys page](https://app.fireworks.ai/settings/users/api-keys). +3. Store it in `.env`. + +```bash +FIREWORKS_API_KEY=your_fireworks_api_key +``` + +### Ollama + +Environment variable: none required + +Used for: + +- Local model inference + +How to set it up: + +1. Install Ollama from the [main Ollama website](https://ollama.com/). +2. Pull the model you want to use by following the [Ollama library and setup docs](https://ollama.com/library). +3. Set `base_url` or `rag_base_url` if needed, such as `http://localhost:11434`. + +## Default Embedding Provider + +### Hugging Face + +Environment variable: `HF_TOKEN` + +> **Optional.** Only required for downloading gated or private Hugging Face models. Public models work without a token. + +Used for: + +- Accessing gated or private Hugging Face models +- Rate-limited API access + +How to get it: + +1. Create or sign in to your [Hugging Face account](https://huggingface.co/). +2. Open the [access tokens page](https://huggingface.co/settings/tokens). +3. Create a new token with the required permissions. +4. Store it in `.env`. 
+ +```bash +HF_TOKEN=your_huggingface_token +``` + +## Recommended `.env` Template + +Use the subset you need: + +```bash +# Publisher providers +SCOPUS_API_KEY=your_scopus_api_key +SPRINGER_OPENACCESS_API_KEY=your_springer_openaccess_api_key +SPRINGER_TDM_API_KEY=your_springer_tdm_api_key +WILEY_API_KEY=your_wiley_api_key +IOP_papers_path=/absolute/path/to/iop_papers + +# LLM providers +OPENAI_API_KEY=your_openai_api_key +GEMINI_API_KEY=your_gemini_api_key +ANTHROPIC_API_KEY=your_anthropic_api_key +DEEPSEEK_API_KEY=your_deepseek_api_key +OPENROUTER_API_KEY=your_openrouter_api_key +TOGETHER_API_KEY=your_together_api_key +COHERE_API_KEY=your_cohere_api_key +FIREWORKS_API_KEY=your_fireworks_api_key + +# Model and embedding access +HF_TOKEN=your_huggingface_token +``` + +## Notes + +- Keep all keys in your local `.env` file and never commit them to version control. +- For most users, the minimum setup is one publisher source plus one LLM provider. +- If you use Gemini models, use `GEMINI_API_KEY`. +- If your embedding model is gated or private on Hugging Face, make sure `HF_TOKEN` is available; public models work without it. 
+ +## Related Pages + +- [Installation](installation.md) +- [Article Processing](../usage/article-processing.md) +- [Data Extraction](../usage/data-extraction.md) +- [RAG Configuration](../rag-config.md) diff --git a/docs/rag-config.md b/docs/rag-config.md index 9b843481..52376775 100644 --- a/docs/rag-config.md +++ b/docs/rag-config.md @@ -123,7 +123,7 @@ scanner.extract_composition_property_data( scanner.extract_composition_property_data( main_extraction_keyword="d33", rag_db_path="embeddings/piezo", - rag_chat_model="deepseek-chat", + rag_chat_model="deepseek/deepseek-chat", rag_max_tokens=1024, rag_top_k=4, ) @@ -178,7 +178,7 @@ scanner.extract_composition_property_data( scanner.extract_composition_property_data( main_extraction_keyword="d33", rag_db_path="embeddings/piezo", - rag_chat_model="together_ai/meta-llama/Llama-3-70b-chat-hf", + rag_chat_model="together/meta-llama/Llama-3-70b-chat-hf", rag_max_tokens=1024, rag_top_k=4, ) @@ -220,7 +220,7 @@ scanner.extract_composition_property_data( scanner.extract_composition_property_data( main_extraction_keyword="d33", rag_db_path="embeddings/piezo", - rag_chat_model="fireworks_ai/accounts/fireworks/models/llama-v3-8b-instruct", + rag_chat_model="fireworks/models/llama-v3-8b-instruct", rag_max_tokens=1024, rag_top_k=4, ) diff --git a/pyproject.toml b/pyproject.toml index 3c6085be..3c07ded7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "comproscanner" -version = "0.1.5" +version = "0.1.6" description = "Multi-agent system for extracting and processing structured composition-property data from scientific literature" readme = "README.md" authors = [{ name = "Aritra Roy", email = "contact@aritraroy.live" }] diff --git a/src/comproscanner/extract_flow/tools/rag_tool.py b/src/comproscanner/extract_flow/tools/rag_tool.py index deff690d..b8ee8220 100644 --- a/src/comproscanner/extract_flow/tools/rag_tool.py +++ 
b/src/comproscanner/extract_flow/tools/rag_tool.py @@ -82,28 +82,28 @@ def _get_llm(self) -> BaseChatModel: "callbacks": callbacks, } # OpenAI models - if model.startswith(("gpt-", "text-", "o1", "o3")): + if model.startswith(("openai/", "gpt-", "text-", "o1", "o3")): self._check_package_exists("langchain_openai", model) from langchain_openai import ChatOpenAI return ChatOpenAI(model=model, request_timeout=1000, **common_params) # Deepseek models - if model.startswith("deepseek"): + if model.startswith("deepseek/"): self._check_package_exists("langchain_deepseek", model) from langchain_deepseek import ChatDeepSeek return ChatDeepSeek(model=model, request_timeout=1000, **common_params) # Google Gemini models - elif model.startswith("gemini-"): + elif model.startswith("gemini/"): self._check_package_exists("langchain_google_genai", model) from langchain_google_genai import ChatGoogleGenerativeAI return ChatGoogleGenerativeAI(model=model, **common_params) # Anthropic Claude models - elif model.startswith("claude-"): + elif model.startswith("claude/"): self._check_package_exists("langchain_anthropic", model) from langchain_anthropic import ChatAnthropic @@ -143,7 +143,7 @@ def _get_llm(self) -> BaseChatModel: return ChatCohere(model=model_name, **common_params) # Fireworks models - elif model.startswith(("fireworks/", "accounts/fireworks")): + elif model.startswith(("fireworks/")): self._check_package_exists("langchain_fireworks", model) from langchain_fireworks import ChatFireworks