Notebooks #137
Conversation
Check out this pull request on ReviewNB to see visual diffs and provide feedback on Jupyter Notebooks. Powered by ReviewNB.
pgronkievitz left a comment
didn't check notebooks yet
question: why does this require a separate gitignore?
```python
article = Article("")
```
```python
def __init__(self, url_list: Union[List[str], str], depth: int = 1):
```
nitpick: use modern typing syntax
```diff
-def __init__(self, url_list: Union[List[str], str], depth: int = 1):
+def __init__(self, url_list: list[str] | str, depth: int = 1):
```
```python
self.depth = depth

@staticmethod
def newspaper_extractor(html):
```
```python
if text_splitter is None:
    _text_splitter: TextSplitter = SpacyTextSplitter(
        pipeline="pl_core_news_sm",
        chunk_size=chunk,
        chunk_overlap=chunk_overlap,
    )
else:
    _text_splitter = text_splitter
```
suggestion:
```diff
-if text_splitter is None:
-    _text_splitter: TextSplitter = SpacyTextSplitter(
-        pipeline="pl_core_news_sm",
-        chunk_size=chunk,
-        chunk_overlap=chunk_overlap,
-    )
-else:
-    _text_splitter = text_splitter
+_text_splitter = text_splitter or SpacyTextSplitter(
+    pipeline="pl_core_news_sm",
+    chunk_size=chunk,
+    chunk_overlap=chunk_overlap,
+)
```
```python
docs = self.load()
docs = reduce(
    lambda data, method: method(data),
    [CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
    docs,
)
```
suggestion:
```diff
-docs = self.load()
-docs = reduce(
-    lambda data, method: method(data),
-    [CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
-    docs,
-)
+docs = reduce(
+    lambda data, method: method(data),
+    [CleanWebLoader.junk_remover, CleanWebLoader.ds_converter],
+    self.load(),
+)
```
```python
loader = LocalDataLoader.loaders[d_type](file_path)
try:
    docs.append(loader.load()[0])
except Exception as e:
```
issue: use specific exceptions, not generic Exception, as it'll silently swallow exceptions you didn't foresee
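One way to act on this, sketched against the snippet above: catch only the failures you can recover from and log them, letting anything unforeseen propagate. The exception tuple below is an illustrative assumption; each loader raises its own errors, so the real list has to come from testing.

```python
import logging

logger = logging.getLogger(__name__)

loader = LocalDataLoader.loaders[d_type](file_path)
try:
    docs.append(loader.load()[0])
except (FileNotFoundError, ValueError) as e:
    # Anticipated, recoverable failures: record and skip this file.
    # Anything unexpected still raises and surfaces immediately.
    logger.warning("Skipping %s: %s", file_path, e)
```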
```toml
[tool.poetry.dev-dependencies]
black = "*"
flake8 = "*"
```
```toml
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.dev-dependencies]
```
issue: legacy syntax, don't use it
```toml
black = "*"
flake8 = "*"
```
issue: be more specific about required versions
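A sketch addressing both points at once, using Poetry's dependency-group syntax (Poetry 1.2+) with explicit constraints; the version numbers are illustrative placeholders, not tested pins:

```toml
# Modern replacement for the legacy [tool.poetry.dev-dependencies] table.
[tool.poetry.group.dev.dependencies]
black = "^24.1"   # placeholder constraint
flake8 = "^7.0"   # placeholder constraint
```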
```python
docs.extend(local_loader.load())

for doc in docs:
    requests.post(url, json=doc)
```
NewBlueWizard left a comment
Fixed the code as suggested. Fixing the exceptions and module versions requires updating and testing the current code.
pgronkievitz left a comment
please use black, ruff (with as many options enabled as possible) and pyright with strict typing
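A minimal pyproject sketch of that toolchain; treat it as a starting point rather than a vetted config, since rule selection and strictness still need tuning against the codebase:

```toml
[tool.black]
line-length = 88

[tool.ruff.lint]
# Enable every rule family, then disable specific rules case by case.
select = ["ALL"]

[tool.pyright]
typeCheckingMode = "strict"
```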
```python
"""

article = Article("")
```
nitpick: are you aware this instance will be shared and may cause weird bugs with overwrites between objects? (this object is created once and will be shared between CleanWebLoader instances)
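A sketch of the per-instance fix, assuming `Article` comes from the `newspaper` package as in the PR's imports:

```python
from newspaper import Article


class CleanWebLoader:
    def __init__(self, url_list: list[str] | str, depth: int = 1):
        # One Article per loader: parsing a page can no longer clobber
        # state that another CleanWebLoader instance is still reading.
        self.article = Article("")
        self.url_list = url_list
        self.depth = depth
```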
```python
@staticmethod
def newspaper_extractor(html: str):
    """
    Extracts and cleans text content from HTML using the 'newspaper' library.

    :param html: HTML content to be processed.
    :return: Cleaned and concatenated text extracted from the HTML.
    """
    CleanWebLoader.article.set_html(html)
    CleanWebLoader.article.parse()
    return " ".join(CleanWebLoader.article.text.split())

@staticmethod
def ds_converter(docs: list[Document]):
    """
    Converts a list of documents into a specific data structure.

    :param docs: List of documents to be converted.
    :return: List of dictionaries, each representing a document with 'text' key.
    """
    return [{"text": doc.page_content} for doc in docs]

@staticmethod
def junk_remover(docs: list[Document]):
    """
    Identifies and returns a list of suspected junk documents based on specific criteria.

    :param docs: A list of documents, where each document is represented as a dictionary.
        Each dictionary should have a "text" key containing the text content of the document.
    :return: A list of suspected junk documents based on the criteria of having less than 300 characters
        or having the same text as another document in the input list.
    """
    junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
    seen_texts = set()
    clear_docs = []
    for doc in docs:
        if "title" not in doc.metadata.keys():
            junk_docs.append(doc)
        elif doc.page_content not in seen_texts and doc not in junk_docs:
            clear_docs.append(doc)
            seen_texts.add(doc.page_content)
        else:
            pass
    return clear_docs
```
issue: those should not be static; add a self argument and use self.article.* instead of CleanWebLoader.article.*
ds_converter and junk_remover should be separate from this class, as they've got nothing to do with it.
also - TYPING, please check with pyright set to strict
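Taken together, the refactor could look like this sketch: the helpers move to module level, `newspaper_extractor` becomes an instance method, and the annotations are strict-typing friendly. The `Document` import path is an assumption based on the langchain usage elsewhere in the PR.

```python
from newspaper import Article
from langchain.docstore.document import Document  # import path assumed


def ds_converter(docs: list[Document]) -> list[dict[str, str]]:
    # Module-level helper: it never touches loader state.
    return [{"text": doc.page_content} for doc in docs]


def junk_remover(docs: list[Document]) -> list[Document]:
    # Module-level helper mirroring the PR's filtering logic.
    junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
    seen_texts: set[str] = set()
    clear_docs: list[Document] = []
    for doc in docs:
        if "title" not in doc.metadata:
            junk_docs.append(doc)
        elif doc.page_content not in seen_texts and doc not in junk_docs:
            clear_docs.append(doc)
            seen_texts.add(doc.page_content)
    return clear_docs


class CleanWebLoader:
    def __init__(self) -> None:
        self.article = Article("")

    def newspaper_extractor(self, html: str) -> str:
        # Instance method: state lives on self, not on the class.
        self.article.set_html(html)
        self.article.parse()
        return " ".join(self.article.text.split())
```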
```diff
@@ -0,0 +1,140 @@
+from newspaper import Article
+from functools import reduce
+from typing import List, Optional
```
nitpick: both are unnecessary - list[type] and type | None work just fine
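Concretely, with PEP 585/604 syntax the typing imports disappear; a sketch, where the second method name is assumed for illustration (wherever `Optional` was actually used):

```python
from langchain.text_splitter import TextSplitter  # import path assumed


class CleanWebLoader:
    # list[str] replaces List[str]; "X | Y" replaces Union[X, Y].
    def __init__(self, url_list: list[str] | str, depth: int = 1): ...

    # "TextSplitter | None" replaces Optional[TextSplitter].
    def load_and_split(self, text_splitter: TextSplitter | None = None): ...
```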
```python
:param chunk_overlap: Overlap size between chunks (default is 80).
:return: List of dictionaries, each representing a document with 'text' key.
"""
_text_splitter: text_splitter or TextSplitter = SpacyTextSplitter(
```
issue: that's some weird-ass typing, wtf
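For reference, the flagged line puts a value expression (`text_splitter or TextSplitter`) in the annotation slot, which no type checker will accept; the earlier suggestion gives what was presumably intended:

```python
# Annotate with the type, supply the default via `or`:
_text_splitter: TextSplitter = text_splitter or SpacyTextSplitter(
    pipeline="pl_core_news_sm",
    chunk_size=chunk,
    chunk_overlap=chunk_overlap,
)
```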
```python
loaders = {
    ".pdf": PyPDFLoader,
    ".json": JSONLoader,
    ".txt": TextLoader,
    ".csv": CSVLoader,
}
```
nitpick: move this to __init__
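A sketch of that move; `self.path` is an assumption about what the constructor stores, and callers would then look up `self.loaders[d_type]` instead of the class attribute:

```python
class LocalDataLoader:
    def __init__(self, path: list[str] | str):
        # Built per instance rather than shared on the class; the loader
        # classes are the langchain ones already imported in this file.
        self.loaders = {
            ".pdf": PyPDFLoader,
            ".json": JSONLoader,
            ".txt": TextLoader,
            ".csv": CSVLoader,
        }
        self.path = path
```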
| ".csv": CSVLoader, | ||
| } | ||
|
|
||
| def __init__(self, path: Union[List[str], str]): |
suggestion:
```diff
-def __init__(self, path: Union[List[str], str]):
+def __init__(self, path: list[str] | str):
```
```python
@staticmethod
def ds_converter(docs):
    """
    Converts a list of documents into a specific data structure.

    :param docs: List of documents to be converted.
    :return: List of dictionaries, each representing a document with 'text' and 'url' keys.
    """
    return [{"text": doc.page_content} for doc in docs]

@staticmethod
def junk_remover(docs):
    """
    Identifies and returns a list of suspected junk documents based on specific criteria.

    :param docs: A list of documents, where each document is represented as a dictionary.
        Each dictionary should have a "text" key containing the text content of the document.
    :return: A list of suspected junk documents based on the criteria of having less than 300 characters
        or having the same text as another document in the input list.
    """
    junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
    seen_texts = set()
    clear_docs = []
    for doc in docs:
        if doc.page_content not in seen_texts and doc not in junk_docs:
            clear_docs.append(doc)
            seen_texts.add(doc.page_content)
    return clear_docs
```
suggestion: yep, those should be shared; if you want them to be inside of this class - make them a mixin
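A sketch of that mixin; the name `JunkFilterMixin` is made up, and the loaders' other base classes are elided since only the inheritance is the point:

```python
from langchain.docstore.document import Document  # import path assumed


class JunkFilterMixin:
    # Hypothetical home for the helpers both loaders currently duplicate.
    @staticmethod
    def ds_converter(docs: list[Document]) -> list[dict[str, str]]:
        return [{"text": doc.page_content} for doc in docs]

    @staticmethod
    def junk_remover(docs: list[Document]) -> list[Document]:
        junk_docs = [doc for doc in docs if len(doc.page_content) < 300]
        seen_texts: set[str] = set()
        clear_docs: list[Document] = []
        for doc in docs:
            if doc.page_content not in seen_texts and doc not in junk_docs:
                clear_docs.append(doc)
                seen_texts.add(doc.page_content)
        return clear_docs


class CleanWebLoader(JunkFilterMixin): ...


class LocalDataLoader(JunkFilterMixin): ...
```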
```python
def test_ds_converter(clean_web_loader):
    docs = [Document(page_content="Document 1"), Document(page_content="Document 2")]
    expected_output = [{"text": "Document 1"}, {"text": "Document 2"}]
    assert clean_web_loader.ds_converter(docs) == expected_output
```
issue: tests should be clean functions with no side effects; this will break once example.com changes (or this fixture is unnecessary at all)
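A fixture-free sketch: `ds_converter` touches no instance state, so the test needs neither a loader object nor the network:

```python
from langchain.docstore.document import Document  # import path assumed


def test_ds_converter():
    # No fixture, no HTTP call: the helper is pure, so test it directly.
    docs = [Document(page_content="Document 1"), Document(page_content="Document 2")]
    expected_output = [{"text": "Document 1"}, {"text": "Document 2"}]
    assert CleanWebLoader.ds_converter(docs) == expected_output
```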