diff --git a/eu_fact_force/dash-app/app.py b/eu_fact_force/dash-app/app.py
index a2d3cc5..345bf65 100644
--- a/eu_fact_force/dash-app/app.py
+++ b/eu_fact_force/dash-app/app.py
@@ -1,14 +1,18 @@
-from dash import Dash, dcc, html
-from dash.dependencies import Input, Output, State
+from dash import Dash, dcc, html, Input, Output, State, ALL, ctx, no_update
 from dash.exceptions import PreventUpdate
 import dash_bootstrap_components as dbc
+
 import plotly.io as pio
 import plotly.graph_objects as go
+import base64
+import io
 import json
+import uuid
 
 from utils.colors import EUPHAColors
 from utils.graph import RandomGraphGenerator
+from utils.parsing import extract_pdf_metadata
 from pages import readme, ingest, graph
 
 # Plotly template
@@ -251,6 +255,156 @@ def toggle_offcanvas(node_data, is_open):
 
 ### Create here callbacks for ingestions
 
+@app.callback(
+    Output('input-doi', 'value'),
+    Output('input-abstract', 'value'),
+    Output('input-journal', 'value'),
+    Output('input-date', 'value'),
+    Output('input-link', 'value'),
+    Output('input-title', 'value'),
+    Output('session-store', 'data'),
+    Input('upload-pdf', 'contents')
+)
+def handle_pdf_upload(contents):
+    """Decode an uploaded PDF, extract its metadata and pre-fill the form.
+
+    Returns the six metadata field values plus the raw metadata dict,
+    which is cached in 'session-store' for the authors callback.
+    """
+    if contents is None:
+        return no_update, no_update, no_update, no_update, no_update, no_update, {}
+
+    # decoding of passed PDFs (data URI: "<content-type>,<base64 payload>")
+    content_type, content_string = contents.split(',')
+    decoded = base64.b64decode(content_string)
+
+    # extract_pdf_metadata call
+    metadata = extract_pdf_metadata(io.BytesIO(decoded))
+
+    # NOTE: extract_pdf_metadata seeds every key with None, so
+    # metadata.get(key, '') would still return None for a failed extraction;
+    # `or ''` guarantees the text inputs always receive strings.
+    return (
+        metadata.get('doi') or '',
+        metadata.get('abstract') or '',
+        metadata.get('journal') or '',
+        metadata.get('publication_date') or '',
+        metadata.get('article_link') or '',
+        metadata.get('title') or '',
+        metadata
+    )
+
+
+@app.callback(
+    Output('authors-container', 'children'),
+    Input('btn-add-author', 'n_clicks'),
+    Input({'type': 'remove-author', 'index': ALL}, 'n_clicks'),
+    Input('session-store', 'data'),
+    State({'type': 'auth-name', 'index': ALL}, 'value'),
+    State({'type': 'auth-surname', 'index': ALL}, 'value'),
+    State({'type': 'auth-email', 'index': ALL}, 'value'),
+    State({'type': 'auth-name', 'index': ALL}, 'id'),
+)
+def update_authors_list(add_clicks, remove_clicks, metadata, names, surnames, emails, ids):
+    """Keep the dynamic list of author rows in sync with user actions.
+
+    Handles three triggers: a fresh PDF upload (rebuild rows from the
+    extracted metadata), the add-author button, and per-row remove buttons.
+    """
+    triggered = ctx.triggered_id
+
+    # on a new pdf upload
+    if triggered == 'session-store' and metadata:
+        authors = metadata.get('authors', [])
+        return [ingest.add_author_line(str(uuid.uuid4()), a.get('name', ''), a.get('surname', ''), a.get('email', '')) for a in authors]
+
+    # reconstructing authors list from the current input values
+    current_authors = []
+    if ids:
+        for idx_id, name, surname, email in zip(ids, names, surnames, emails):
+            current_authors.append({
+                'index': idx_id['index'],
+                'name': name or "",
+                'surname': surname or "",
+                'email': email or ""
+            })
+
+    # if missing author
+    if triggered == 'btn-add-author':
+        current_authors.append({
+            'index': str(uuid.uuid4()),
+            'name': "",
+            'surname': "",
+            'email': ""
+        })
+
+    # remove blank/irrelevant author field
+    if isinstance(triggered, dict) and triggered.get('type') == 'remove-author':
+        remove_index = triggered.get('index')
+        current_authors = [a for a in current_authors if a['index'] != remove_index]
+
+    return [ingest.add_author_line(a['index'], a['name'], a['surname'], a['email']) for a in current_authors]
+
+
+@app.callback(
+    Output('input-doi', 'disabled'),
+    Output('input-abstract', 'disabled'),
+    Output('input-journal', 'disabled'),
+    Output('input-date', 'disabled'),
+    Output('input-link', 'disabled'),
+    Output('input-category', 'disabled'),
+    Output('input-type', 'disabled'),
+    Output('input-title', 'disabled'),
+    Input('chk-meta-correct', 'value')
+)
+def lock_metadata(is_correct):
+    """Freeze all metadata inputs once the user confirms they are correct."""
+    val = bool(is_correct)
+    return val, val, val, val, val, val, val, val
+
+
+@app.callback(
+    Output({'type': 'auth-name', 'index': ALL}, 'disabled'),
+    Output({'type': 'auth-surname', 'index': ALL}, 'disabled'),
+    Output({'type': 'auth-email', 'index': ALL}, 'disabled'),
+    Output({'type': 'remove-author', 'index': ALL}, 'disabled'),
+    Output('btn-add-author', 'disabled'),
+    Input('chk-authors-correct', 'value'),
+    State({'type': 'auth-name', 'index': ALL}, 'id')
+)
+def lock_authors(is_correct, ids):
+    """Freeze every author row (and the add button) once confirmed.
+
+    Pattern-matching outputs require one value per matched component,
+    hence the per-row lists sized by len(ids).
+    """
+    is_corr = bool(is_correct)
+    if not ids:
+        return [], [], [], [], is_corr
+    length = len(ids)
+    return [is_corr]*length, [is_corr]*length, [is_corr]*length, [is_corr]*length, is_corr
+
+
+@app.callback(
+    Output('final-output', 'children'),
+    Input('btn-final-upload', 'n_clicks'),
+    State('input-doi', 'value'),
+    State('input-abstract', 'value'),
+    State('input-journal', 'value'),
+    State('input-date', 'value'),
+    State('input-link', 'value'),
+    State('input-category', 'value'),
+    State('input-type', 'value'),
+    State('input-title', 'value'),
+    State({'type': 'auth-name', 'index': ALL}, 'value'),
+    State({'type': 'auth-surname', 'index': ALL}, 'value'),
+    State({'type': 'auth-email', 'index': ALL}, 'value'),
+    prevent_initial_call=True
+)
+def finalize_and_display_json(n_clicks, doi, abstract, journal, date, link, category, study_type, title, names, surnames, emails):
+    """Assemble the confirmed form values into a JSON payload and display it."""
+    # keep only rows where at least a name or surname was entered
+    authors_list = [
+        {"name": n, "surname": s, "email": e}
+        for n, s, e in zip(names, surnames, emails) if n or s
+    ]
+
+    metadata_json = {
+        "title": title,
+        "category": category,
+        "study_type": study_type,
+        "journal": journal,
+        "publication_year": date,
+        "doi": doi,
+        "article_link": link,
+        "abstract": abstract,
+        "authors": authors_list
+    }
+
+    return html.Div([
+        dbc.Alert("Successfully contributed, thank you!", color="success"),
+        html.H4("Metadata JSON"),
+        html.Pre(json.dumps(metadata_json, indent=4), style={'backgroundColor': '#f8f9fa', 'padding': '15px', 'borderRadius': '8px', 'border': '1px solid #dee2e6'})
+    ])
 
 if __name__ == "__main__":
     app.run(debug=True)
diff --git a/eu_fact_force/dash-app/pages/ingest.py b/eu_fact_force/dash-app/pages/ingest.py
index 40ba966..4bc34a0 100644
--- a/eu_fact_force/dash-app/pages/ingest.py
+++ b/eu_fact_force/dash-app/pages/ingest.py
@@ -1,18 +1,304 @@
from dash import dcc, html +import dash_bootstrap_components as dbc from utils.colors import EUPHAColors - def make_layout(): - return html.Div( + #Sidebar + sidebar = html.Div( [ - html.H2("Ingestion"), - dcc.Markdown("Ingestion layout to be completed here..."), + html.Div( + [ + + html.H3( + "EU Fact Force", + className="text-center", + style={ + "fontWeight": "700", + "fontSize": "1.9rem", + "marginBottom": "20px", + "color": EUPHAColors.dark_blue + } + ), + + html.Hr(style={"margin": "1.2rem 0"}), + + html.H5( + "How it works", + style={ + "fontWeight": "500", + "marginBottom": "12px", + "marginTop": "45px" + } + ), + + html.Ol( + [ + html.Li("Upload a PDF"), + html.Li("Validate DOI + abstract"), + html.Li("Validate authors"), + html.Li("Click Upload file") + ], + style={ + "paddingLeft": "1.2rem", + "marginLeft": "0", + "lineHeight": "1.8" + } + ), + ], + style={ + "maxWidth": "240px", + "margin": "0 auto" + } + ) ], style={ - "border-radius": "15px", - "padding": "20px", - "background-color": EUPHAColors.white, - }, + "padding": "2rem 1rem", + "backgroundColor": EUPHAColors.white, + "height": "100vh", + "position": "fixed", + "top": 0, + "left": 0, + "width": "16%", + "borderRight": "1px solid #dee2e6" + } ) + + # Main page + main_content = html.Div( + [ + html.Div( + [ + html.H1( + "EU Fact Force - Article uploading page", + className="mb-3 text-center", + style={ + "fontWeight": "700", + "fontSize": "2.5rem", + "lineHeight": "1.15" + } + ), + html.H3( + "Welcome to EU Fact Force articles uploading pages", + className="text-center mb-4", + style={ + "color": EUPHAColors.black, + "fontWeight": "500", + "fontSize": "1.5rem", + "lineHeight": "1.3" + } + ), + html.P( + "Thank you for collaborating with us, you will find here a page where you can upload and declare authors of your papers in attempt to build a safer and healthier community! 
Thank you for your contribution!", + className="text-center mb-5", + style={ + "maxWidth": "900px", + "margin": "0 auto", + "fontSize": "1.1rem", + "lineHeight": "1.7", + "color": EUPHAColors.black + } + ), + ], + style={ + "maxWidth": "1100px", + "margin": "0 auto 2rem auto" + } + ), + + dbc.Card([ + dbc.CardBody([ + html.H4( + "Upload & Metadatas", + className="card-title font-weight-bold mb-4" + ), + dcc.Upload( + id='upload-pdf', + children=html.Div(['Drop your article here or ', html.A('Select a PDF', className="font-weight-bold")]), + style={ + 'width': '100%', + 'height': '80px', + 'lineHeight': '80px', + 'borderWidth': '2px', + 'borderStyle': 'dashed', + 'borderColor': EUPHAColors.dark_blue, + 'textAlign': 'center', + 'borderRadius': '10px', + 'marginBottom': '20px', + 'backgroundColor': EUPHAColors.white, + 'cursor': 'pointer' + } + ), + html.H5("General informations", className="mt-4 font-weight-bold"), + dbc.Row([ + dbc.Col([ + dbc.Label("Article Title"), + dbc.Input(id='input-title', type='text', placeholder="Title of the article", className="mb-3"), + + dbc.Row([ + dbc.Col([ + dbc.Label("Category"), + dcc.Dropdown( + id='input-category', + options=[ + {'label': 'Scientific Article', 'value': 'scientific_article'}, + {'label': 'Report', 'value': 'report'}, + {'label': 'Thesis', 'value': 'thesis'}, + {'label': 'Working Paper', 'value': 'working_paper'}, + {'label': 'Book Chapter', 'value': 'book_chapter'}, + {'label': 'Other', 'value': 'other'} + ], + value='scientific_article', + className="mb-3" + ), + ], width=6), + dbc.Col([ + dbc.Label("Study Type"), + dcc.Dropdown( + id='input-type', + options=[ + {'label': 'Meta-analysis', 'value': 'meta_analysis'}, + {'label': 'Systematic review', 'value': 'systematic_review'}, + {'label': 'Evidence review', 'value': 'evidence_review'}, + {'label': 'Cohort study', 'value': 'cohort_study'}, + {'label': 'Case-control study', 'value': 'case_control_study'}, + {'label': 'Cross-sectional study', 'value': 
'cross_sectional_study'}, + {'label': 'Randomized controlled trial', 'value': 'rct'}, + {'label': 'Other', 'value': 'other'} + ], + className="mb-3" + ), + ], width=6), + ]), + dbc.Label("Journal / Source"), + dbc.Input(id='input-journal', type='text', placeholder="ex: The Lancet Public Health", className="mb-3"), + + dbc.Row([ + dbc.Col([ + dbc.Label("Publication Year"), + dbc.Input(id='input-date', type='text', placeholder="ex: 2023"), + ], width=6), + dbc.Col([ + dbc.Label("DOI"), + dbc.Input(id='input-doi', type='text', placeholder="ex: 10.1038/s41586-021-00000-x"), + ], width=6), + ], className="mb-3"), + + dbc.Label("Publication URL"), + dbc.Input(id='input-link', type='text', placeholder="https://pubmed.ncbi.nlm.nih.gov/...", className="mb-3"), + + dbc.Label("Abstract"), + dbc.Textarea(id='input-abstract', style={'height': 150}, placeholder="Lorem ipsum dolor sit amet"), + + dbc.Checkbox(id='chk-meta-correct', label="This information is correct", className="mt-3 font-weight-bold text-success"), + ], width=12) + ]), + ]) + ], className="mb-4 shadow-sm", style={"borderRadius": "16px"}), + + dbc.Card([ + dbc.CardBody([ + html.H4( + "Authors", + className="card-title font-weight-bold mb-4" + ), + html.Div(id='authors-container'), + dbc.Button( + "➕ Add an author", + id='btn-add-author', + n_clicks=0, + outline=True, + className="mt-3", + style={ + "color": "#3B6096", + "borderColor": "#3B6096", + "borderRadius": "10px", + "fontWeight": "500" + } + ), + html.Br(), + dbc.Checkbox(id='chk-authors-correct', label="Authors information is correct", className="mt-3 font-weight-bold text-success"), + ]) + ], className="mb-4 shadow-sm", style={"borderRadius": "16px"}), + + dbc.Button( + "Upload file", + id='btn-final-upload', + size="lg", + className="w-100 mb-4", + style={ + "backgroundColor": EUPHAColors.dark_blue, + "borderColor": EUPHAColors.dark_blue, + "color": "white", + "fontWeight": "600", + "borderRadius": "10px" + } + ), + + html.Div(id='final-output', 
className="mt-4 pb-5") + ], + style={ + "marginLeft": "16%", + "padding": "5rem 1.5rem 2rem 1.5rem", + "width": "84%", + "backgroundColor": "#ffffff" + } + ) + + return html.Div([ + dcc.Store(id='session-store', data={}), + sidebar, main_content], + style={"fontFamily": "system-ui, -apple-system, sans-serif", + "backgroundColor": "#f5f7fa"}) + + + + +def add_author_line(index, name="", surname="", email=""): + """One-click addition/suppression of a new author line""" + + return dbc.Card([ + dbc.CardBody([ + dbc.Row([ + dbc.Col( + dbc.Input( + id={'type': 'auth-name', 'index': index}, + value=name, + placeholder="Name" + ), + width=3 + ), + dbc.Col( + dbc.Input( + id={'type': 'auth-surname', 'index': index}, + value=surname, + placeholder="Surname" + ), + width=3 + ), + dbc.Col( + dbc.Input( + id={'type': 'auth-email', 'index': index}, + value=email, + placeholder="Email (Corresponding)" + ), + width=4 + ), + dbc.Col( + dbc.Button( + "Remove", + id={'type': 'remove-author', 'index': index}, + color="danger", + outline=True, + className="w-100", + style={ + "whiteSpace": "nowrap", + "minWidth": "100px" + } + ), + width=2 + ) + ], className="align-items-center g-2") + ], className="p-2") + ], className="mb-3 border-light shadow-sm") diff --git a/eu_fact_force/dash-app/utils/parsing.py b/eu_fact_force/dash-app/utils/parsing.py new file mode 100644 index 0000000..3300782 --- /dev/null +++ b/eu_fact_force/dash-app/utils/parsing.py @@ -0,0 +1,222 @@ +import base64 +from typing import Optional +import re +import fitz # PyMuPDF + +def load_png_as_data_uri(png_path: str) -> Optional[str]: + """Return a data URI for an PNG file, or None if not found.""" + try: + with open(png_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:image/png+xml;base64,{b64}" + except FileNotFoundError: + return None + +def extract_doi_from_pdf(text: str) -> Optional[str]: + """Extract DOI from PDF text using regex pattern.""" + # Pattern for DOI: 10.xxxx/xxxxx + 
match = re.search(r'(?:doi[:\s]+)?(?:https?://)?(?:dx\.)?doi\.org/(10\.\S+)', text, re.IGNORECASE) + if match: + return match.group(1) if match.group(1).startswith('10.') else match.group(0) + + # Alternative pattern + match = re.search(r'10\.\d{4,}/\S+', text) + if match: + return match.group(0) + return None + + +def extract_abstract_from_pdf(text: str) -> Optional[str]: + """Extract abstract from PDF text.""" + # Look for "Abstract" section + abstract_pattern = r'(?:abstract|summary)\s*[:]*\s*(.+?)(?=(?:introduction|keywords|1\.\s|methods|methodology|introduction|related work|background)|\Z)' + match = re.search(abstract_pattern, text, re.IGNORECASE | re.DOTALL) + if match: + abstract_text = match.group(1).strip() + # Clean up and limit to reasonable length + abstract_text = re.sub(r'\s+', ' ', abstract_text)[:500] + return abstract_text if len(abstract_text) > 20 else None + return None + + +def extract_authors_from_pdf(text: str) -> list[dict]: + """Extract authors by finding the typical author line in scientific papers.""" + authors = [] + + def clean_name(name: str) -> str: + # Supprime chiffres, *, †, § collés au nom + return re.sub(r'[\d\*†‡§]+', '', name).strip() + + lines = text.split('\n')[:50] + + for line in lines: + line = line.strip() + + # Une ligne d'auteurs contient typiquement "and" ou une virgule + # et ressemble à des noms propres (Majuscule, pas trop longue) + if len(line) > 150 or len(line) < 5: + continue + if not re.search(r'\band\b|,', line): + continue + # Doit commencer par une majuscule + if not re.match(r'^[A-Z]', line): + continue + # Ne doit pas contenir de mots typiques de non-auteurs + skip_words = ['abstract', 'keywords', 'introduction', 'figure', + 'table', 'doi', 'http', 'university', 'institute', + 'open access', 'copyright', 'license', 'received'] + if any(w in line.lower() for w in skip_words): + continue + # Tous les "mots" (après nettoyage) doivent ressembler à des noms propres + # càd commencer par une majuscule ou être 
un chiffre/symbole + test_line = clean_name(line) + words = [w for w in re.split(r'[\s,]+', test_line) if w] + if not words: + continue + # Au moins 80% des mots doivent commencer par une majuscule + capitalized = sum(1 for w in words if re.match(r'^[A-Z]', w) or w.lower() == 'and') + if capitalized / len(words) < 0.8: + continue + + # C'est probablement une ligne d'auteurs — on parse + raw_names = re.split(r',\s*|\s+and\s+', line) + for raw in raw_names: + name = clean_name(raw).strip() + if not name or len(name) < 3: + continue + parts = name.split() + if len(parts) >= 2: + authors.append({ + "name": " ".join(parts[:-1]), + "surname": parts[-1], + "email": "" + }) + + # Rattache l'email du corresponding author + corr_match = re.search( + r'\*Correspondence[:\s]+([A-Z][a-z]+(?:[\s\-][A-Za-z\-]+)+)\s+([\w.\-]+@[\w.\-]+\.\w+)', + text + ) + if corr_match and authors: + corr_name = re.sub(r'[\d\*†‡§]+', '', corr_match.group(1)).strip() + corr_email = corr_match.group(2) + for author in authors: + full = f"{author['name']} {author['surname']}" + if corr_name in full or full in corr_name: + author['email'] = corr_email + + return authors[:10] + +def extract_date_from_pdf(text: str) -> Optional[str]: + """Extract publication date (year only) from PDF text.""" + # YYYY-MM-DD or YYYY/MM/DD + match = re.search(r'\b((?:19|20)\d{2})[-/.](?:0[1-9]|1[012])[-/.](?:0[1-9]|[12][0-9]|3[01])\b', text) + if match: + return match.group(1) + + # DD-MM-YYYY or DD/MM/YYYY + match = re.search(r'\b(?:0[1-9]|[12][0-9]|3[01])[-/.](?:0[1-9]|1[012])[-/.]((?:19|20)\d{2})\b', text) + + # Pattern: Month Year (e.g., "January 2023") + match = re.search(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+((?:19|20)\d{2})\b', text, re.IGNORECASE) + if match: + return match.group(1) + + # We look for years typically appearing in headers or near "Copyright" or "Received" + # Just a year (between 1900 and 2099) + match = re.search(r'\b(19\d{2}|20\d{2})\b', 
text) + if match: + return match.group(1) + + return None + +def extract_journal_from_pdf(text: str) -> Optional[str]: + """Extract journal name from PDF text.""" + journal_patterns = [ + r'Published in\s*[:]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', + r'Journal of\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', + r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\s+Journal)', + r'Source\s*[:]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)' + ] + for pattern in journal_patterns: + match = re.search(pattern, text) + if match: + return match.group(1).strip() + + # Try to find it in the first few lines if not found by pattern + lines = text.split('\n')[:15] + for line in lines: + line = line.strip() + if any(kw in line for kw in ["Journal", "Review", "Nature", "Science", "Lancet", "Medicine"]): + if len(line.split()) < 10: # Avoid long sentences + return line + return None + +def extract_link_from_pdf(text: str, doi: Optional[str] = None) -> Optional[str]: + """Extract article link from PDF text or DOI.""" + if doi: + return f"https://doi.org/{doi}" + + # Look for https links that might be the editor's link + links = re.findall(r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)', text) + for link in links: + if any(domain in link for domain in ['sciencedirect', 'springer', 'wiley', 'nature.com', 'thelancet', 'bmj', 'frontiersin', 'plos', 'pubmed.ncbi.nlm.nih.gov'\ + , 'who.int', 'cdc.gov', 'acpjournals', 'nejm.org', 'jama.jamanetwork.com']): + return link + return links[0] if links else None + +def extract_text_by_blocks(uploaded_file_bytes) -> str: + doc = fitz.open(stream=uploaded_file_bytes, filetype="pdf") + full_text = "" + for page in doc[:3]: + # Trie les blocs par position verticale puis horizontale + blocks = page.get_text("blocks") + blocks.sort(key=lambda b: (round(b[1] / 20), b[0])) + for block in blocks: + full_text += block[4] + "\n" + return full_text + +def extract_title_from_pdf(text: str) -> Optional[str]: + """Try to extract the title from the 
first few lines of the PDF.""" + lines = [line.strip() for line in text.split('\n') if line.strip()] + if not lines: + return None + + # Typically the title is in the first few lines, is not too long, + # and doesn't contain certain keywords. + for line in lines[:10]: + # Skip lines that are likely not titles (e.g., journal names, DOI, authors) + if any(kw in line.lower() for kw in ["journal", "doi:", "http", "vol.", "issn", "received:", "accepted:", "copyright"]): + continue + # Titles are usually at least 3 words and not excessively long (e.g. < 250 chars) + if 3 <= len(line.split()) <= 40 and len(line) < 450: + return line + return None + +def extract_pdf_metadata(uploaded_file) -> dict: + """Extract metadata from PDF file.""" + metadata = { + "title": None, + "doi": None, + "abstract": None, + "publication_date": None, + "journal": None, + "article_link": None, + "authors": [] + } + try: + # Extract text from PDF + pdf_text = extract_text_by_blocks(uploaded_file.read()) + + # Extract metadata + metadata["title"] = extract_title_from_pdf(pdf_text) + metadata["doi"] = extract_doi_from_pdf(pdf_text) + metadata["abstract"] = extract_abstract_from_pdf(pdf_text) + metadata["authors"] = extract_authors_from_pdf(pdf_text) + metadata["publication_date"] = extract_date_from_pdf(pdf_text) + metadata["journal"] = extract_journal_from_pdf(pdf_text) + metadata["article_link"] = extract_link_from_pdf(pdf_text, metadata["doi"]) + + except Exception as e: + print(f"Error processing PDF: {e}") + return metadata