diff --git a/eu_fact_force/dash-app/app.py b/eu_fact_force/dash-app/app.py
index a2d3cc5..345bf65 100644
--- a/eu_fact_force/dash-app/app.py
+++ b/eu_fact_force/dash-app/app.py
@@ -1,14 +1,18 @@
-from dash import Dash, dcc, html
-from dash.dependencies import Input, Output, State
+from dash import Dash, dcc, html, Input, Output, State, ALL, ctx, no_update
 from dash.exceptions import PreventUpdate
 import dash_bootstrap_components as dbc
+
 import plotly.io as pio
 import plotly.graph_objects as go
+import base64
+import io
 import json
+import uuid
 
 from utils.colors import EUPHAColors
 from utils.graph import RandomGraphGenerator
+from utils.parsing import extract_pdf_metadata
 from pages import readme, ingest, graph
 
 # Plotly template
@@ -251,6 +255,156 @@ def toggle_offcanvas(node_data, is_open):
 
 ### Create here callbacks for ingestions
 
+@app.callback(
+    Output('input-doi', 'value'),
+    Output('input-abstract', 'value'),
+    Output('input-journal', 'value'),
+    Output('input-date', 'value'),
+    Output('input-link', 'value'),
+    Output('input-title', 'value'),
+    Output('session-store', 'data'),
+    Input('upload-pdf', 'contents')
+)
+def handle_pdf_upload(contents):
+    """Decode an uploaded PDF, extract its metadata and pre-fill the form.
+
+    Returns the six metadata field values plus the raw metadata dict,
+    which is cached in 'session-store' for the authors callback.
+    """
+    if contents is None:
+        return no_update, no_update, no_update, no_update, no_update, no_update, {}
+
+    # decoding of passed PDFs (data URI: "<content-type>,<base64 payload>")
+    content_type, content_string = contents.split(',')
+    decoded = base64.b64decode(content_string)
+
+    # extract_pdf_metadata call
+    metadata = extract_pdf_metadata(io.BytesIO(decoded))
+
+    # NOTE: extract_pdf_metadata seeds every key with None, so
+    # metadata.get(key, '') would still return None for a failed extraction;
+    # `or ''` guarantees the text inputs always receive strings.
+    return (
+        metadata.get('doi') or '',
+        metadata.get('abstract') or '',
+        metadata.get('journal') or '',
+        metadata.get('publication_date') or '',
+        metadata.get('article_link') or '',
+        metadata.get('title') or '',
+        metadata
+    )
+
+
+@app.callback(
+    Output('authors-container', 'children'),
+    Input('btn-add-author', 'n_clicks'),
+    Input({'type': 'remove-author', 'index': ALL}, 'n_clicks'),
+    Input('session-store', 'data'),
+    State({'type': 'auth-name', 'index': ALL}, 'value'),
+    State({'type': 'auth-surname', 'index': ALL}, 'value'),
+    State({'type': 'auth-email', 'index': ALL}, 'value'),
+    State({'type': 'auth-name', 'index': ALL}, 'id'),
+)
+def update_authors_list(add_clicks, remove_clicks, metadata, names, surnames, emails, ids):
+    """Keep the dynamic list of author rows in sync with user actions.
+
+    Handles three triggers: a fresh PDF upload (rebuild rows from the
+    extracted metadata), the add-author button, and per-row remove buttons.
+    """
+    triggered = ctx.triggered_id
+
+    # on a new pdf upload
+    if triggered == 'session-store' and metadata:
+        authors = metadata.get('authors', [])
+        return [ingest.add_author_line(str(uuid.uuid4()), a.get('name', ''), a.get('surname', ''), a.get('email', '')) for a in authors]
+
+    # reconstructing authors list from the current input values
+    current_authors = []
+    if ids:
+        for idx_id, name, surname, email in zip(ids, names, surnames, emails):
+            current_authors.append({
+                'index': idx_id['index'],
+                'name': name or "",
+                'surname': surname or "",
+                'email': email or ""
+            })
+
+    # if missing author
+    if triggered == 'btn-add-author':
+        current_authors.append({
+            'index': str(uuid.uuid4()),
+            'name': "",
+            'surname': "",
+            'email': ""
+        })
+
+    # remove blank/irrelevant author field
+    if isinstance(triggered, dict) and triggered.get('type') == 'remove-author':
+        remove_index = triggered.get('index')
+        current_authors = [a for a in current_authors if a['index'] != remove_index]
+
+    return [ingest.add_author_line(a['index'], a['name'], a['surname'], a['email']) for a in current_authors]
+
+
+@app.callback(
+    Output('input-doi', 'disabled'),
+    Output('input-abstract', 'disabled'),
+    Output('input-journal', 'disabled'),
+    Output('input-date', 'disabled'),
+    Output('input-link', 'disabled'),
+    Output('input-category', 'disabled'),
+    Output('input-type', 'disabled'),
+    Output('input-title', 'disabled'),
+    Input('chk-meta-correct', 'value')
+)
+def lock_metadata(is_correct):
+    """Freeze all metadata inputs once the user confirms they are correct."""
+    val = bool(is_correct)
+    return val, val, val, val, val, val, val, val
+
+
+@app.callback(
+    Output({'type': 'auth-name', 'index': ALL}, 'disabled'),
+    Output({'type': 'auth-surname', 'index': ALL}, 'disabled'),
+    Output({'type': 'auth-email', 'index': ALL}, 'disabled'),
+    Output({'type': 'remove-author', 'index': ALL}, 'disabled'),
+    Output('btn-add-author', 'disabled'),
+    Input('chk-authors-correct', 'value'),
+    State({'type': 'auth-name', 'index': ALL}, 'id')
+)
+def lock_authors(is_correct, ids):
+    """Freeze every author row (and the add button) once confirmed.
+
+    Pattern-matching outputs require one value per matched component,
+    hence the per-row lists sized by len(ids).
+    """
+    is_corr = bool(is_correct)
+    if not ids:
+        return [], [], [], [], is_corr
+    length = len(ids)
+    return [is_corr]*length, [is_corr]*length, [is_corr]*length, [is_corr]*length, is_corr
+
+
+@app.callback(
+    Output('final-output', 'children'),
+    Input('btn-final-upload', 'n_clicks'),
+    State('input-doi', 'value'),
+    State('input-abstract', 'value'),
+    State('input-journal', 'value'),
+    State('input-date', 'value'),
+    State('input-link', 'value'),
+    State('input-category', 'value'),
+    State('input-type', 'value'),
+    State('input-title', 'value'),
+    State({'type': 'auth-name', 'index': ALL}, 'value'),
+    State({'type': 'auth-surname', 'index': ALL}, 'value'),
+    State({'type': 'auth-email', 'index': ALL}, 'value'),
+    prevent_initial_call=True
+)
+def finalize_and_display_json(n_clicks, doi, abstract, journal, date, link, category, study_type, title, names, surnames, emails):
+    """Assemble the confirmed form values into a JSON payload and display it."""
+    # keep only rows where at least a name or surname was entered
+    authors_list = [
+        {"name": n, "surname": s, "email": e}
+        for n, s, e in zip(names, surnames, emails) if n or s
+    ]
+
+    metadata_json = {
+        "title": title,
+        "category": category,
+        "study_type": study_type,
+        "journal": journal,
+        "publication_year": date,
+        "doi": doi,
+        "article_link": link,
+        "abstract": abstract,
+        "authors": authors_list
+    }
+
+    return html.Div([
+        dbc.Alert("Successfully contributed, thank you!", color="success"),
+        html.H4("Metadata JSON"),
+        html.Pre(json.dumps(metadata_json, indent=4), style={'backgroundColor': '#f8f9fa', 'padding': '15px', 'borderRadius': '8px', 'border': '1px solid #dee2e6'})
+    ])
 
 if __name__ == "__main__":
     app.run(debug=True)
diff --git a/eu_fact_force/dash-app/pages/ingest.py b/eu_fact_force/dash-app/pages/ingest.py
index 40ba966..4bc34a0 100644
--- a/eu_fact_force/dash-app/pages/ingest.py
+++ b/eu_fact_force/dash-app/pages/ingest.py
@@ -1,18 +1,304 @@
from dash import dcc, html +import dash_bootstrap_components as dbc from utils.colors import EUPHAColors - def make_layout(): - return html.Div( + #Sidebar + sidebar = html.Div( [ - html.H2("Ingestion"), - dcc.Markdown("Ingestion layout to be completed here..."), + html.Div( + [ + + html.H3( + "EU Fact Force", + className="text-center", + style={ + "fontWeight": "700", + "fontSize": "1.9rem", + "marginBottom": "20px", + "color": EUPHAColors.dark_blue + } + ), + + html.Hr(style={"margin": "1.2rem 0"}), + + html.H5( + "How it works", + style={ + "fontWeight": "500", + "marginBottom": "12px", + "marginTop": "45px" + } + ), + + html.Ol( + [ + html.Li("Upload a PDF"), + html.Li("Validate DOI + abstract"), + html.Li("Validate authors"), + html.Li("Click Upload file") + ], + style={ + "paddingLeft": "1.2rem", + "marginLeft": "0", + "lineHeight": "1.8" + } + ), + ], + style={ + "maxWidth": "240px", + "margin": "0 auto" + } + ) ], style={ - "border-radius": "15px", - "padding": "20px", - "background-color": EUPHAColors.white, - }, + "padding": "2rem 1rem", + "backgroundColor": EUPHAColors.white, + "height": "100vh", + "position": "fixed", + "top": 0, + "left": 0, + "width": "16%", + "borderRight": "1px solid #dee2e6" + } ) + + # Main page + main_content = html.Div( + [ + html.Div( + [ + html.H1( + "EU Fact Force - Article uploading page", + className="mb-3 text-center", + style={ + "fontWeight": "700", + "fontSize": "2.5rem", + "lineHeight": "1.15" + } + ), + html.H3( + "Welcome to EU Fact Force articles uploading pages", + className="text-center mb-4", + style={ + "color": EUPHAColors.black, + "fontWeight": "500", + "fontSize": "1.5rem", + "lineHeight": "1.3" + } + ), + html.P( + "Thank you for collaborating with us, you will find here a page where you can upload and declare authors of your papers in attempt to build a safer and healthier community! 
Thank you for your contribution!", + className="text-center mb-5", + style={ + "maxWidth": "900px", + "margin": "0 auto", + "fontSize": "1.1rem", + "lineHeight": "1.7", + "color": EUPHAColors.black + } + ), + ], + style={ + "maxWidth": "1100px", + "margin": "0 auto 2rem auto" + } + ), + + dbc.Card([ + dbc.CardBody([ + html.H4( + "Upload & Metadatas", + className="card-title font-weight-bold mb-4" + ), + dcc.Upload( + id='upload-pdf', + children=html.Div(['Drop your article here or ', html.A('Select a PDF', className="font-weight-bold")]), + style={ + 'width': '100%', + 'height': '80px', + 'lineHeight': '80px', + 'borderWidth': '2px', + 'borderStyle': 'dashed', + 'borderColor': EUPHAColors.dark_blue, + 'textAlign': 'center', + 'borderRadius': '10px', + 'marginBottom': '20px', + 'backgroundColor': EUPHAColors.white, + 'cursor': 'pointer' + } + ), + html.H5("General informations", className="mt-4 font-weight-bold"), + dbc.Row([ + dbc.Col([ + dbc.Label("Article Title"), + dbc.Input(id='input-title', type='text', placeholder="Title of the article", className="mb-3"), + + dbc.Row([ + dbc.Col([ + dbc.Label("Category"), + dcc.Dropdown( + id='input-category', + options=[ + {'label': 'Scientific Article', 'value': 'scientific_article'}, + {'label': 'Report', 'value': 'report'}, + {'label': 'Thesis', 'value': 'thesis'}, + {'label': 'Working Paper', 'value': 'working_paper'}, + {'label': 'Book Chapter', 'value': 'book_chapter'}, + {'label': 'Other', 'value': 'other'} + ], + value='scientific_article', + className="mb-3" + ), + ], width=6), + dbc.Col([ + dbc.Label("Study Type"), + dcc.Dropdown( + id='input-type', + options=[ + {'label': 'Meta-analysis', 'value': 'meta_analysis'}, + {'label': 'Systematic review', 'value': 'systematic_review'}, + {'label': 'Evidence review', 'value': 'evidence_review'}, + {'label': 'Cohort study', 'value': 'cohort_study'}, + {'label': 'Case-control study', 'value': 'case_control_study'}, + {'label': 'Cross-sectional study', 'value': 
'cross_sectional_study'}, + {'label': 'Randomized controlled trial', 'value': 'rct'}, + {'label': 'Other', 'value': 'other'} + ], + className="mb-3" + ), + ], width=6), + ]), + dbc.Label("Journal / Source"), + dbc.Input(id='input-journal', type='text', placeholder="ex: The Lancet Public Health", className="mb-3"), + + dbc.Row([ + dbc.Col([ + dbc.Label("Publication Year"), + dbc.Input(id='input-date', type='text', placeholder="ex: 2023"), + ], width=6), + dbc.Col([ + dbc.Label("DOI"), + dbc.Input(id='input-doi', type='text', placeholder="ex: 10.1038/s41586-021-00000-x"), + ], width=6), + ], className="mb-3"), + + dbc.Label("Publication URL"), + dbc.Input(id='input-link', type='text', placeholder="https://pubmed.ncbi.nlm.nih.gov/...", className="mb-3"), + + dbc.Label("Abstract"), + dbc.Textarea(id='input-abstract', style={'height': 150}, placeholder="Lorem ipsum dolor sit amet"), + + dbc.Checkbox(id='chk-meta-correct', label="This information is correct", className="mt-3 font-weight-bold text-success"), + ], width=12) + ]), + ]) + ], className="mb-4 shadow-sm", style={"borderRadius": "16px"}), + + dbc.Card([ + dbc.CardBody([ + html.H4( + "Authors", + className="card-title font-weight-bold mb-4" + ), + html.Div(id='authors-container'), + dbc.Button( + "➕ Add an author", + id='btn-add-author', + n_clicks=0, + outline=True, + className="mt-3", + style={ + "color": "#3B6096", + "borderColor": "#3B6096", + "borderRadius": "10px", + "fontWeight": "500" + } + ), + html.Br(), + dbc.Checkbox(id='chk-authors-correct', label="Authors information is correct", className="mt-3 font-weight-bold text-success"), + ]) + ], className="mb-4 shadow-sm", style={"borderRadius": "16px"}), + + dbc.Button( + "Upload file", + id='btn-final-upload', + size="lg", + className="w-100 mb-4", + style={ + "backgroundColor": EUPHAColors.dark_blue, + "borderColor": EUPHAColors.dark_blue, + "color": "white", + "fontWeight": "600", + "borderRadius": "10px" + } + ), + + html.Div(id='final-output', 
className="mt-4 pb-5") + ], + style={ + "marginLeft": "16%", + "padding": "5rem 1.5rem 2rem 1.5rem", + "width": "84%", + "backgroundColor": "#ffffff" + } + ) + + return html.Div([ + dcc.Store(id='session-store', data={}), + sidebar, main_content], + style={"fontFamily": "system-ui, -apple-system, sans-serif", + "backgroundColor": "#f5f7fa"}) + + + + +def add_author_line(index, name="", surname="", email=""): + """One-click addition/suppression of a new author line""" + + return dbc.Card([ + dbc.CardBody([ + dbc.Row([ + dbc.Col( + dbc.Input( + id={'type': 'auth-name', 'index': index}, + value=name, + placeholder="Name" + ), + width=3 + ), + dbc.Col( + dbc.Input( + id={'type': 'auth-surname', 'index': index}, + value=surname, + placeholder="Surname" + ), + width=3 + ), + dbc.Col( + dbc.Input( + id={'type': 'auth-email', 'index': index}, + value=email, + placeholder="Email (Corresponding)" + ), + width=4 + ), + dbc.Col( + dbc.Button( + "Remove", + id={'type': 'remove-author', 'index': index}, + color="danger", + outline=True, + className="w-100", + style={ + "whiteSpace": "nowrap", + "minWidth": "100px" + } + ), + width=2 + ) + ], className="align-items-center g-2") + ], className="p-2") + ], className="mb-3 border-light shadow-sm") diff --git a/eu_fact_force/dash-app/utils/parsing.py b/eu_fact_force/dash-app/utils/parsing.py new file mode 100644 index 0000000..3300782 --- /dev/null +++ b/eu_fact_force/dash-app/utils/parsing.py @@ -0,0 +1,222 @@ +import base64 +from typing import Optional +import re +import fitz # PyMuPDF + +def load_png_as_data_uri(png_path: str) -> Optional[str]: + """Return a data URI for an PNG file, or None if not found.""" + try: + with open(png_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:image/png+xml;base64,{b64}" + except FileNotFoundError: + return None + +def extract_doi_from_pdf(text: str) -> Optional[str]: + """Extract DOI from PDF text using regex pattern.""" + # Pattern for DOI: 10.xxxx/xxxxx + 
match = re.search(r'(?:doi[:\s]+)?(?:https?://)?(?:dx\.)?doi\.org/(10\.\S+)', text, re.IGNORECASE) + if match: + return match.group(1) if match.group(1).startswith('10.') else match.group(0) + + # Alternative pattern + match = re.search(r'10\.\d{4,}/\S+', text) + if match: + return match.group(0) + return None + + +def extract_abstract_from_pdf(text: str) -> Optional[str]: + """Extract abstract from PDF text.""" + # Look for "Abstract" section + abstract_pattern = r'(?:abstract|summary)\s*[:]*\s*(.+?)(?=(?:introduction|keywords|1\.\s|methods|methodology|introduction|related work|background)|\Z)' + match = re.search(abstract_pattern, text, re.IGNORECASE | re.DOTALL) + if match: + abstract_text = match.group(1).strip() + # Clean up and limit to reasonable length + abstract_text = re.sub(r'\s+', ' ', abstract_text)[:500] + return abstract_text if len(abstract_text) > 20 else None + return None + + +def extract_authors_from_pdf(text: str) -> list[dict]: + """Extract authors by finding the typical author line in scientific papers.""" + authors = [] + + def clean_name(name: str) -> str: + # Supprime chiffres, *, †, § collés au nom + return re.sub(r'[\d\*†‡§]+', '', name).strip() + + lines = text.split('\n')[:50] + + for line in lines: + line = line.strip() + + # Une ligne d'auteurs contient typiquement "and" ou une virgule + # et ressemble à des noms propres (Majuscule, pas trop longue) + if len(line) > 150 or len(line) < 5: + continue + if not re.search(r'\band\b|,', line): + continue + # Doit commencer par une majuscule + if not re.match(r'^[A-Z]', line): + continue + # Ne doit pas contenir de mots typiques de non-auteurs + skip_words = ['abstract', 'keywords', 'introduction', 'figure', + 'table', 'doi', 'http', 'university', 'institute', + 'open access', 'copyright', 'license', 'received'] + if any(w in line.lower() for w in skip_words): + continue + # Tous les "mots" (après nettoyage) doivent ressembler à des noms propres + # càd commencer par une majuscule ou être 
un chiffre/symbole + test_line = clean_name(line) + words = [w for w in re.split(r'[\s,]+', test_line) if w] + if not words: + continue + # Au moins 80% des mots doivent commencer par une majuscule + capitalized = sum(1 for w in words if re.match(r'^[A-Z]', w) or w.lower() == 'and') + if capitalized / len(words) < 0.8: + continue + + # C'est probablement une ligne d'auteurs — on parse + raw_names = re.split(r',\s*|\s+and\s+', line) + for raw in raw_names: + name = clean_name(raw).strip() + if not name or len(name) < 3: + continue + parts = name.split() + if len(parts) >= 2: + authors.append({ + "name": " ".join(parts[:-1]), + "surname": parts[-1], + "email": "" + }) + + # Rattache l'email du corresponding author + corr_match = re.search( + r'\*Correspondence[:\s]+([A-Z][a-z]+(?:[\s\-][A-Za-z\-]+)+)\s+([\w.\-]+@[\w.\-]+\.\w+)', + text + ) + if corr_match and authors: + corr_name = re.sub(r'[\d\*†‡§]+', '', corr_match.group(1)).strip() + corr_email = corr_match.group(2) + for author in authors: + full = f"{author['name']} {author['surname']}" + if corr_name in full or full in corr_name: + author['email'] = corr_email + + return authors[:10] + +def extract_date_from_pdf(text: str) -> Optional[str]: + """Extract publication date (year only) from PDF text.""" + # YYYY-MM-DD or YYYY/MM/DD + match = re.search(r'\b((?:19|20)\d{2})[-/.](?:0[1-9]|1[012])[-/.](?:0[1-9]|[12][0-9]|3[01])\b', text) + if match: + return match.group(1) + + # DD-MM-YYYY or DD/MM/YYYY + match = re.search(r'\b(?:0[1-9]|[12][0-9]|3[01])[-/.](?:0[1-9]|1[012])[-/.]((?:19|20)\d{2})\b', text) + + # Pattern: Month Year (e.g., "January 2023") + match = re.search(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+((?:19|20)\d{2})\b', text, re.IGNORECASE) + if match: + return match.group(1) + + # We look for years typically appearing in headers or near "Copyright" or "Received" + # Just a year (between 1900 and 2099) + match = re.search(r'\b(19\d{2}|20\d{2})\b', 
text) + if match: + return match.group(1) + + return None + +def extract_journal_from_pdf(text: str) -> Optional[str]: + """Extract journal name from PDF text.""" + journal_patterns = [ + r'Published in\s*[:]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', + r'Journal of\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)', + r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\s+Journal)', + r'Source\s*[:]?\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)' + ] + for pattern in journal_patterns: + match = re.search(pattern, text) + if match: + return match.group(1).strip() + + # Try to find it in the first few lines if not found by pattern + lines = text.split('\n')[:15] + for line in lines: + line = line.strip() + if any(kw in line for kw in ["Journal", "Review", "Nature", "Science", "Lancet", "Medicine"]): + if len(line.split()) < 10: # Avoid long sentences + return line + return None + +def extract_link_from_pdf(text: str, doi: Optional[str] = None) -> Optional[str]: + """Extract article link from PDF text or DOI.""" + if doi: + return f"https://doi.org/{doi}" + + # Look for https links that might be the editor's link + links = re.findall(r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*)', text) + for link in links: + if any(domain in link for domain in ['sciencedirect', 'springer', 'wiley', 'nature.com', 'thelancet', 'bmj', 'frontiersin', 'plos', 'pubmed.ncbi.nlm.nih.gov'\ + , 'who.int', 'cdc.gov', 'acpjournals', 'nejm.org', 'jama.jamanetwork.com']): + return link + return links[0] if links else None + +def extract_text_by_blocks(uploaded_file_bytes) -> str: + doc = fitz.open(stream=uploaded_file_bytes, filetype="pdf") + full_text = "" + for page in doc[:3]: + # Trie les blocs par position verticale puis horizontale + blocks = page.get_text("blocks") + blocks.sort(key=lambda b: (round(b[1] / 20), b[0])) + for block in blocks: + full_text += block[4] + "\n" + return full_text + +def extract_title_from_pdf(text: str) -> Optional[str]: + """Try to extract the title from the 
first few lines of the PDF.""" + lines = [line.strip() for line in text.split('\n') if line.strip()] + if not lines: + return None + + # Typically the title is in the first few lines, is not too long, + # and doesn't contain certain keywords. + for line in lines[:10]: + # Skip lines that are likely not titles (e.g., journal names, DOI, authors) + if any(kw in line.lower() for kw in ["journal", "doi:", "http", "vol.", "issn", "received:", "accepted:", "copyright"]): + continue + # Titles are usually at least 3 words and not excessively long (e.g. < 250 chars) + if 3 <= len(line.split()) <= 40 and len(line) < 450: + return line + return None + +def extract_pdf_metadata(uploaded_file) -> dict: + """Extract metadata from PDF file.""" + metadata = { + "title": None, + "doi": None, + "abstract": None, + "publication_date": None, + "journal": None, + "article_link": None, + "authors": [] + } + try: + # Extract text from PDF + pdf_text = extract_text_by_blocks(uploaded_file.read()) + + # Extract metadata + metadata["title"] = extract_title_from_pdf(pdf_text) + metadata["doi"] = extract_doi_from_pdf(pdf_text) + metadata["abstract"] = extract_abstract_from_pdf(pdf_text) + metadata["authors"] = extract_authors_from_pdf(pdf_text) + metadata["publication_date"] = extract_date_from_pdf(pdf_text) + metadata["journal"] = extract_journal_from_pdf(pdf_text) + metadata["article_link"] = extract_link_from_pdf(pdf_text, metadata["doi"]) + + except Exception as e: + print(f"Error processing PDF: {e}") + return metadata