From 4792125776cbc6b68bbd01fdc7b96c9c8617f196 Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Thu, 29 May 2025 02:25:22 +0900
Subject: [PATCH 1/8] feat: update full PDF upload, add fund.js, upgrade
 metadata tags to v2.0.0

---
 .gitignore                 | Bin 466 -> 506 bytes
 1_upload_basic_metadata.js |  43 ++++++-----
 2_upload_pdf.js            | 142 ++++++++++++-------------------------
 fund.js                    |  51 +++++++++++++
 package.json               |  13 +++-
 5 files changed, 129 insertions(+), 120 deletions(-)
 create mode 100644 fund.js

diff --git a/.gitignore b/.gitignore
index 9bbd1f23556c0f07691ddf27e495da3c9a2c4cfe..0cd1ee7882957fdfd807d9ee0b28803724029023 100644
GIT binary patch
delta 15
Wcmcb_{EK-*0ORB`M!m^lj5Po-papUO

delta 11
Scmeyxe2IBO0ORBe#wGwAP6T2A

diff --git a/1_upload_basic_metadata.js b/1_upload_basic_metadata.js
index caf1a99..d0c70bd 100644
--- a/1_upload_basic_metadata.js
+++ b/1_upload_basic_metadata.js
@@ -4,17 +4,19 @@ const { Solana } = require("@irys/upload-solana");
 const fs = require("fs").promises;
 const path = require("path");
 
+// 初始化上传器
 const getIrysUploader = async () => {
     try {
         const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
-        console.log("Irys uploader initialized.");
+        console.log("✅ Irys uploader initialized.");
         return irysUploader;
     } catch (error) {
-        console.error("Failed to initialize Irys uploader:", error);
+        console.error("❌ Failed to initialize Irys uploader:", error);
         return null;
     }
 };
 
+// 上传 basic_metadata.json 中的数据
 const uploadBasicMetadata = async () => {
     const irys = await getIrysUploader();
     if (!irys) {
@@ -23,14 +25,13 @@ const uploadBasicMetadata = async () => {
     }
 
     try {
-        // Read the basic_metadata.json file
         const filePath = path.join(process.cwd(), 'basic_metadata.json');
-        console.log(`Reading file: ${filePath}`);
+        console.log(`📄 Reading file: ${filePath}`);
         
         const content = await fs.readFile(filePath, 'utf8');
         const papers = JSON.parse(content);
         
-        console.log(`Loaded ${papers.length} papers for processing`);
+        console.log(`📚 Loaded ${papers.length} papers for processing`);
 
         let successCount = 0;
         let failCount = 0;
@@ -47,12 +48,12 @@ const uploadBasicMetadata = async () => {
 
             try {
                 const normalizedDoi = paper.doi.trim();
-                const normalizedTitle = paper.title
-                    .replace(/\s+/g, ' ')  // Replace multiple spaces with single space
-                    .replace(/\n/g, '')    // Remove newlines
-                    .trim();               // Remove leading/trailing spaces
+                const normalizedTitle = (paper.title || "")
+                    .replace(/\s+/g, ' ')
+                    .replace(/\n/g, '')
+                    .trim();
 
-                const normalizedAuthors = paper.authors
+                const normalizedAuthors = (paper.authors || "")
                     .replace(/\s+/g, ' ')
                     .replace(/\n/g, '')
                     .trim();
@@ -60,17 +61,17 @@ const uploadBasicMetadata = async () => {
                 const tags = [
                     { name: "App-Name", value: "scivault" },
                     { name: "Content-Type", value: "application/json" },
-                    { name: "Version", value: "1.0.3" },
+                    { name: "Version", value: "2.0.0" },
                     { name: "doi", value: normalizedDoi },
                     { name: "title", value: normalizedTitle },
                     { name: "authors", value: normalizedAuthors },
-                    { name: "aid", value: paper.aid }
+                    { name: "aid", value: paper.aid || "" }
                 ];
 
                 const paperMetadata = Buffer.from(JSON.stringify(paper));
                 const receipt = await irys.upload(paperMetadata, { tags });
 
-                console.log(`✅ Uploaded: ${paper.doi} (${receipt.id})`);
+                console.log(`✅ Uploaded: ${normalizedDoi} (${receipt.id})`);
                 successCount++;
 
             } catch (error) {
@@ -78,25 +79,23 @@ const uploadBasicMetadata = async () => {
                 failCount++;
             }
 
-            // Progress report every 10 papers
             if ((i + 1) % 10 === 0 || i === papers.length - 1) {
                 console.log(`\n📊 Progress Report:`);
-                console.log(`   Success: ${successCount}`);
-                console.log(`   Failed: ${failCount}`);
-                console.log(`   Progress: ${Math.round((i + 1) / papers.length * 100)}%`);
+                console.log(`   ✅ Success: ${successCount}`);
+                console.log(`   ❌ Failed: ${failCount}`);
+                console.log(`   🔄 Progress: ${Math.round((i + 1) / papers.length * 100)}%`);
             }
         }
 
         console.log(`\n✨ Upload Complete`);
-        console.log(`   Final Results:`);
-        console.log(`   Total Success: ${successCount}`);
-        console.log(`   Total Failed: ${failCount}`);
-        console.log(`   Success Rate: ${Math.round(successCount / papers.length * 100)}%`);
+        console.log(`   ✅ Total Success: ${successCount}`);
+        console.log(`   ❌ Total Failed: ${failCount}`);
+        console.log(`   📈 Success Rate: ${Math.round(successCount / papers.length * 100)}%`);
 
     } catch (error) {
         console.error("❌ Error uploading metadata:", error);
     }
 };
 
-// Run the upload process
+// 执行上传
 uploadBasicMetadata().catch(console.error);
diff --git a/2_upload_pdf.js b/2_upload_pdf.js
index d468847..e6de4bf 100644
--- a/2_upload_pdf.js
+++ b/2_upload_pdf.js
@@ -1,23 +1,22 @@
 require("dotenv").config();
 const { Uploader } = require("@irys/upload");
 const { Solana } = require("@irys/upload-solana");
-const { PDFDocument } = require("pdf-lib");
 const fs = require("fs").promises;
 const path = require("path");
 
-const MAX_SLICE_SIZE = 50 * 1024; // 50KB per slice
-
+// 初始化 Irys 上传器
 const getIrysUploader = async () => {
     try {
         const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
-        console.log("Irys uploader initialized.");
+        console.log("✅ Irys uploader initialized.");
         return irysUploader;
     } catch (error) {
-        console.error("Failed to initialize Irys uploader:", error);
+        console.error("❌ Failed to initialize Irys uploader:", error);
         return null;
     }
 };
 
+// 遍历目录查找 PDF 文件
 async function walkDir(dir) {
     try {
         const files = await fs.readdir(dir);
@@ -29,64 +28,40 @@ async function walkDir(dir) {
     }
 }
 
+// 读取 JSON 元数据文件中的 DOI
 async function getDoiFromMetadata(pdfPath) {
     try {
-        // Get the corresponding JSON file path by replacing .pdf with .json
         const jsonPath = pdfPath.replace('.pdf', '.json');
-        
-        console.log(`Looking for metadata file: ${jsonPath}`);
-        
-        // Read and parse the JSON file
+        console.log(`🔍 Looking for metadata file: ${jsonPath}`);
         const jsonData = await fs.readFile(jsonPath, 'utf8');
         const metadata = JSON.parse(jsonData);
-        
-        if (!metadata.doi) {
-            throw new Error(`No DOI found in metadata file: ${jsonPath}`);
-        }
-
-        console.log(`Found DOI: ${metadata.doi}`);
+        if (!metadata.doi) throw new Error(`No DOI found in metadata file: ${jsonPath}`);
+        console.log(`✅ Found DOI: ${metadata.doi}`);
         return metadata.doi;
     } catch (error) {
-        console.error(`Error getting DOI from metadata:`, error);
+        console.error(`❌ Error getting DOI from metadata:`, error);
         throw error;
     }
 }
 
-const sliceAndUploadPdf = async (inputPath, doi) => {
+// 上传单个 PDF（不再切片）
+const uploadPdf = async (inputPath, doi) => {
     try {
         console.log(`\n📄 Processing PDF: ${path.basename(inputPath)}`);
-        
-        // Read and validate PDF
-        const pdfBytes = await fs.readFile(inputPath);
-        const pdfDoc = await PDFDocument.load(pdfBytes);
-        const fileBase64 = await pdfDoc.saveAsBase64();
-
-        // Create chunks
-        const chunks = [];
-        for (let i = 0; i < fileBase64.length; i += MAX_SLICE_SIZE) {
-            const chunk = fileBase64.slice(i, i + MAX_SLICE_SIZE);
-            chunks.push(chunk);
-        }
-
-        console.log(`File size: ${fileBase64.length} bytes`);
-        console.log(`Total chunks: ${chunks.length}`);
 
-        // Check if PDF was already uploaded
+        // 1. 检查是否已上传过
         const query = `
             query {
                 transactions(
                     tags: [
+                        { name: "App-Name", values: ["scivault"] },
                         { name: "Content-Type", values: ["application/pdf"] },
-                        { name: "application", values: ["scivault"] },
-                        { name: "Version", values: ["1.0.3"] },
-                        { name: "Type", values: ["pdf-index"] },
-                        { name: "Collection", values: ["${doi}"] }
+                        { name: "Version", values: ["2.0.0"] },
+                        { name: "doi", values: ["${doi}"] }
                     ]
                 ) {
                     edges {
-                        node {
-                            id
-                        }
+                        node { id }
                     }
                 }
             }
@@ -100,56 +75,42 @@ const sliceAndUploadPdf = async (inputPath, doi) => {
 
         const result = await response.json();
         if (result.data?.transactions?.edges?.[0]?.node?.id) {
-            console.log(`⚠️ PDF already exists for DOI: ${doi}`);
+            console.log(`⚠️ PDF already uploaded for DOI: ${doi}`);
             return result.data.transactions.edges.map(edge => edge.node.id);
         }
 
-        // Upload chunks
+        // 2. 上传 PDF
         const irys = await getIrysUploader();
-        if (!irys) {
-            throw new Error("Failed to initialize Irys uploader");
-        }
+        if (!irys) throw new Error("Failed to initialize Irys uploader");
 
-        const receiptIDs = [];
+        const buffer = await fs.readFile(inputPath);
         const tags = [
+            { name: "App-Name", value: "scivault" },
             { name: "Content-Type", value: "application/pdf" },
-            { name: "application", value: "scivault" },
-            { name: "Version", value: "1.0.3" },
-            { name: "Type", value: "pdf-index" },
-            { name: "Collection", value: doi }
+            { name: "Version", value: "2.0.0" },
+            { name: "doi", value: doi }
         ];
 
-        for (let i = 0; i < chunks.length; i++) {
-            console.log(`\nUploading chunk ${i + 1}/${chunks.length}...`);
-            const receipt = await irys.upload(Buffer.from(chunks[i]), { tags });
-            receiptIDs.push(receipt.id);
-            console.log(`✅ Chunk uploaded: ${receipt.id}`);
-        }
-
-        console.log(`\n✨ PDF uploaded successfully!`);
-        console.log(`Receipt IDs: ${receiptIDs.join(", ")}`);
-        return receiptIDs;
+        const receipt = await irys.upload(buffer, { tags });
+        console.log(`✅ PDF uploaded successfully. Transaction ID: ${receipt.id}`);
+        return [receipt.id];
 
     } catch (error) {
-        console.error(`❌ Error processing PDF: ${error.message}`);
+        console.error(`❌ Error uploading PDF: ${error.message}`);
         throw error;
     }
 };
 
-// 添加错误日志功能
+// 错误记录
 async function logError(filePath, error, doi = null) {
     const errorLogPath = path.join(process.cwd(), 'upload_errors.json');
     try {
-        // 读取现有的错误日志，如果不存在则创建新的
         let errorLog = [];
         try {
             const existingLog = await fs.readFile(errorLogPath, 'utf8');
             errorLog = JSON.parse(existingLog);
-        } catch (e) {
-            // 文件不存在，使用空数组
-        }
+        } catch {}
 
-        // 添加新的错误记录
         errorLog.push({
             timestamp: new Date().toISOString(),
             file: filePath,
@@ -158,18 +119,18 @@ async function logError(filePath, error, doi = null) {
             stack: error.stack
         });
 
-        // 保存更新后的错误日志
         await fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2));
-        console.log(`Error logged to ${errorLogPath}`);
+        console.log(`📝 Error logged to ${errorLogPath}`);
     } catch (logError) {
-        console.error('Failed to log error:', logError);
+        console.error('❌ Failed to log error:', logError);
     }
 }
 
+// 批量上传 PDF 主函数
 const uploadPdfs = async (pdfDir) => {
     try {
         const files = await walkDir(pdfDir);
-        console.log(`Found ${files.length} PDF files in ${pdfDir}`);
+        console.log(`\n📁 Found ${files.length} PDF files in ${pdfDir}`);
 
         let successCount = 0;
         let failCount = 0;
@@ -179,34 +140,24 @@ const uploadPdfs = async (pdfDir) => {
             const pdfFile = files[i];
             let doi = null;
             try {
-                // 获取 DOI
                 doi = await getDoiFromMetadata(pdfFile);
-                console.log(`\nProcessing PDF: ${path.basename(pdfFile)}`);
-                console.log(`Using DOI: ${doi}`);
-                
-                // 尝试上传
-                await sliceAndUploadPdf(pdfFile, doi);
+                await uploadPdf(pdfFile, doi);
                 successCount++;
             } catch (error) {
                 failCount++;
                 await logError(pdfFile, error, doi);
-                errorFiles.push({
-                    file: pdfFile,
-                    doi: doi,
-                    error: error.message
-                });
+                errorFiles.push({ file: pdfFile, doi: doi, error: error.message });
             }
 
-            // Progress report
             if ((i + 1) % 5 === 0 || i === files.length - 1) {
                 console.log(`\n📊 Progress Report:`);
-                console.log(`   Success: ${successCount}`);
-                console.log(`   Failed: ${failCount}`);
-                console.log(`   Progress: ${Math.round((i + 1) / files.length * 100)}%`);
+                console.log(`   ✅ Success: ${successCount}`);
+                console.log(`   ❌ Failed: ${failCount}`);
+                console.log(`   🔄 Progress: ${Math.round((i + 1) / files.length * 100)}%`);
             }
         }
 
-        // 在完成时生成详细报告
+        // 写入报告
         const report = {
             timestamp: new Date().toISOString(),
             totalFiles: files.length,
@@ -216,17 +167,15 @@ const uploadPdfs = async (pdfDir) => {
             failedFiles: errorFiles
         };
 
-        // 保存报告
         const reportPath = path.join(process.cwd(), 'upload_report.json');
         await fs.writeFile(reportPath, JSON.stringify(report, null, 2));
 
         console.log(`\n🎉 Upload Complete`);
-        console.log(`   Total Success: ${successCount}`);
-        console.log(`   Total Failed: ${failCount}`);
-        console.log(`   Success Rate: ${Math.round(successCount / files.length * 100)}%`);
-        console.log(`   Detailed report saved to: ${reportPath}`);
+        console.log(`   ✅ Total Success: ${successCount}`);
+        console.log(`   ❌ Total Failed: ${failCount}`);
+        console.log(`   📄 Report saved to: ${reportPath}`);
         if (failCount > 0) {
-            console.log(`   Error log saved to: upload_errors.json`);
+            console.log(`   📌 Error log saved to: upload_errors.json`);
         }
 
     } catch (error) {
@@ -235,14 +184,15 @@ const uploadPdfs = async (pdfDir) => {
     }
 };
 
-// If running directly
+// CLI 执行入口
 if (require.main === module) {
     const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata');
     uploadPdfs(metadataDir).catch(console.error);
 }
 
+// 可导出函数供其他模块调用
 module.exports = {
     getIrysUploader,
-    sliceAndUploadPdf,
+    uploadPdf,
     uploadPdfs
 };
diff --git a/fund.js b/fund.js
new file mode 100644
index 0000000..2efad66
--- /dev/null
+++ b/fund.js
@@ -0,0 +1,51 @@
+require("dotenv").config();
+const readline = require("readline");
+const { Uploader } = require("@irys/upload");
+const { Solana } = require("@irys/upload-solana");
+
+const rl = readline.createInterface({
+  input: process.stdin,
+  output: process.stdout
+});
+
+const askUser = (question) => {
+  return new Promise((resolve) => {
+    rl.question(question, (answer) => {
+      resolve(answer.trim().toLowerCase());
+    });
+  });
+};
+
+const main = async () => {
+  try {
+    const irys = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
+
+    const address = irys.address;
+    const token = irys.token;
+
+    const atomicBalance = await irys.getLoadedBalance();
+    const balance = irys.utils.fromAtomic(atomicBalance);
+
+    console.log(`\n🌐 Public Address: ${address}`);
+    console.log(`💰 Current Irys Balance: ${balance} ${token}`);
+    console.log(`🔗 Check wallet on Solana Explorer: https://explorer.solana.com/address/${address}?cluster=mainnet`);
+
+    const answer = await askUser("\n🪙 Do you want to fund 0.01 SOL to Irys? (yes/no): ");
+
+    if (answer === "yes" || answer === "y") {
+      const amount = "0.01";
+      console.log(`\n⛽ Funding ${amount} SOL to Irys...`);
+
+      const fundResult = await irys.fund(irys.utils.toAtomic(amount));
+      console.log(`✅ Fund successful! Transaction ID: ${fundResult.id}`);
+    } else {
+      console.log("ℹ️ Funding skipped.");
+    }
+  } catch (err) {
+    console.error("❌ Failed to get balance or fund Irys:", err);
+  } finally {
+    rl.close();
+  }
+};
+
+main();
diff --git a/package.json b/package.json
index 06f3ca5..a2fb714 100644
--- a/package.json
+++ b/package.json
@@ -28,5 +28,14 @@
     "decentralized"
   ],
   "author": "SciVault",
-  "license": "MIT"
-} 
\ No newline at end of file
+  "license": "MIT",
+  "devDependencies": {},
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/Scihub-Community/sciuploader.git"
+  },
+  "bugs": {
+    "url": "https://github.com/Scihub-Community/sciuploader/issues"
+  },
+  "homepage": "https://github.com/Scihub-Community/sciuploader#readme"
+}

From d948daecfbcba44507b94e3cc4b670c24706618e Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Thu, 29 May 2025 02:46:40 +0900
Subject: [PATCH 2/8] docs: fix repository clone URL in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c626acd..dc773cd 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ A decentralized academic paper repository system built on Arweave/Irys.
 
 1. Clone this repository:
    ```bash
-   git clone https://github.com/yourusername/scivault.git
+   git clone https://github.com/SciVault/sciuploader
    cd sciuploader
    ```
 

From b326bfbcd0b8a5f5b390604d9dc2de92c0690a81 Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Thu, 29 May 2025 03:00:22 +0900
Subject: [PATCH 3/8] docs: remove Step 3 from README, bump version to 2.0.0

---
 README.md | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/README.md b/README.md
index dc773cd..1715ff0 100644
--- a/README.md
+++ b/README.md
@@ -54,17 +54,11 @@ node 2_upload_pdf.js
 
 Note: If uploads fail due to network issues, you can safely run the script again. It will skip already uploaded files and continue with failed ones.
 
-### Step 3: Upload Complete Metadata
-
-Upload the complete metadata with all paper details:
-```bash
-node 3_upload_all_metadata.js
-```
 
 ## Version Control
 
 The system uses semantic versioning for content management:
-- Current version: `1.0.3`
+- Current version: `2.0.0`
 - Format: `MAJOR.MINOR.PATCH`
   - MAJOR: Breaking changes
   - MINOR: New features

From 5cdb7f24aa19c84dcbe7307500e7353b289e047e Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Tue, 3 Jun 2025 19:36:19 +0900
Subject: [PATCH 4/8] Refactor: restructure script flow and replace old upload
 logic

---
 0_generate_basic_metadata.js   | 100 -----------------
 0_run_workflow.js              |  57 ++++++++++
 1_fetch_all_dois.js            |  88 +++++++++++++++
 1_upload_basic_metadata.js     | 101 -----------------
 2_fetch_all_pdfs.js            | 165 +++++++++++++++++++++++++++
 2_upload_pdf.js                | 198 --------------------------------
 3_generate_basic_metadata.js   |  99 ++++++++++++++++
 3_upload_all_metadata.js       | 200 ---------------------------------
 4_upload_all_basic_metadata.js | 131 +++++++++++++++++++++
 5_upload_all_pdfs.js           | 168 +++++++++++++++++++++++++++
 fund.js => 9_fund.js           |   0
 package.json                   |   7 +-
 12 files changed, 713 insertions(+), 601 deletions(-)
 delete mode 100644 0_generate_basic_metadata.js
 create mode 100644 0_run_workflow.js
 create mode 100644 1_fetch_all_dois.js
 delete mode 100644 1_upload_basic_metadata.js
 create mode 100644 2_fetch_all_pdfs.js
 delete mode 100644 2_upload_pdf.js
 create mode 100644 3_generate_basic_metadata.js
 delete mode 100644 3_upload_all_metadata.js
 create mode 100644 4_upload_all_basic_metadata.js
 create mode 100644 5_upload_all_pdfs.js
 rename fund.js => 9_fund.js (100%)

diff --git a/0_generate_basic_metadata.js b/0_generate_basic_metadata.js
deleted file mode 100644
index 9e4804f..0000000
--- a/0_generate_basic_metadata.js
+++ /dev/null
@@ -1,100 +0,0 @@
-const fs = require('fs').promises;
-const path = require('path');
-
-async function walkDir(dir) {
-    try {
-        const files = await fs.readdir(dir);
-        const jsonFiles = files.filter(file => file.endsWith('.json'));
-        return jsonFiles.map(file => path.join(dir, file));
-    } catch (error) {
-        console.error('Error reading directory:', error);
-        throw error;
-    }
-}
-
-function extractAbstract(paper) {
-    // Try to reconstruct abstract from inverted index if available
-    if (paper.openalex?.abstract_inverted_index) {
-        const words = [];
-        const index = paper.openalex.abstract_inverted_index;
-        const maxPosition = Math.max(...Object.values(index).flat());
-        
-        for (let i = 0; i <= maxPosition; i++) {
-            for (const [word, positions] of Object.entries(index)) {
-                if (positions.includes(i)) {
-                    words[i] = word;
-                    break;
-                }
-            }
-        }
-        return words.join(' ');
-    }
-    return ""; // Return empty string if no abstract found
-}
-
-function extractBasicMetadata(paper) {
-    return {
-        abstract: extractAbstract(paper),
-        title: paper.openalex?.title || 
-               paper.crossref?.title?.[0] || 
-               "",
-        authors: paper.openalex?.authorships
-            ?.map(a => a.raw_author_name)
-            .join(", ") ||
-            paper.crossref?.author
-            ?.map(a => `${a.given} ${a.family}`)
-            .join(", ") ||
-            "",
-        doi: paper.doi || "",
-        aid: paper.openalex?.id?.split("/").pop() || 
-             paper.crossref?.DOI?.replace(/[^a-zA-Z0-9]/g, "") || 
-             ""
-    };
-}
-
-async function generateBasicMetadata(metadataDir) {
-    try {
-        // Get all JSON files in the directory
-        const files = await walkDir(metadataDir);
-        
-        // Process each file
-        const metadata = [];
-        for (const file of files) {
-            try {
-                console.log(`Processing file: ${file}`);  // Add logging
-                const content = await fs.readFile(file, 'utf8');
-                const paper = JSON.parse(content.trim());  // Add trim() to remove any BOM or whitespace
-                
-                const basicMetadata = extractBasicMetadata(paper);
-                metadata.push(basicMetadata);
-            } catch (error) {
-                console.error(`Error processing file ${file}:`, error);
-                // Continue with next file instead of stopping
-                continue;
-            }
-        }
-
-        // Write the results to a file
-        const outputPath = path.join(process.cwd(), 'basic_metadata.json');
-        await fs.writeFile(
-            outputPath, 
-            JSON.stringify(metadata, null, 2)
-        );
-
-        console.log(`Basic metadata generated and saved to ${outputPath}`);
-        console.log(`Processed ${metadata.length} files successfully`);
-        return metadata;
-    } catch (error) {
-        console.error('Error generating basic metadata:', error);
-        throw error;
-    }
-}
-
-// Export the function if using as a module
-module.exports = generateBasicMetadata;
-
-// If running directly
-if (require.main === module) {
-    const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata');
-    generateBasicMetadata(metadataDir).catch(console.error);
-}
diff --git a/0_run_workflow.js b/0_run_workflow.js
new file mode 100644
index 0000000..5932947
--- /dev/null
+++ b/0_run_workflow.js
@@ -0,0 +1,57 @@
+// 0_run_workflow.js
+const { execSync } = require("child_process");
+
+// Get CLI arguments
+const args = process.argv.slice(2);
+const getArg = (name) => {
+  const prefix = `--${name}=`;
+  const found = args.find((arg) => arg.startsWith(prefix));
+  return found ? parseInt(found.slice(prefix.length), 10) : undefined;
+};
+
+const startPage = getArg("start-page");
+const endPage = getArg("end-page");
+// Reject missing/non-numeric values, pages below 1 (0 was already rejected), and reversed ranges.
+if (!Number.isInteger(startPage) || !Number.isInteger(endPage) || startPage < 1 || endPage < startPage) {
+  console.error("❌ Missing or invalid arguments. Usage: node 0_run_workflow.js --start-page=3 --end-page=4");
+  process.exit(1);
+}
+
+console.log(`🚀 Starting workflow from page ${startPage} to ${endPage}\n`);
+
+const steps = [
+  {
+    name: "📥 Step 1️⃣: Fetching DOI JSON...",
+    command: `node 1_fetch_all_dois.js --start-page=${startPage} --end-page=${endPage}`,
+  },
+  {
+    name: "📄 Step 2️⃣: Downloading PDFs...",
+    command: `node 2_fetch_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`,
+  },
+  {
+    name: "🧠 Step 3️⃣: Generating metadata...",
+    command: `node 3_generate_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`,
+  },
+  {
+    name: "🆙 Step 4️⃣: Uploading metadata to Irys...",
+    command: `node 4_upload_all_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`,
+  },
+  {
+    name: "📤 Step 5️⃣: Uploading PDFs to Irys...",
+    command: `node 5_upload_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`,
+  },
+];
+
+(async () => {
+  for (const step of steps) {
+    console.log(`\n${step.name}`);
+    try {
+      execSync(step.command, { stdio: "inherit" });
+    } catch (err) {
+      console.error(`❌ Workflow failed: ${err.message}`);
+      process.exit(1);
+    }
+  }
+
+  console.log("\n✅ All steps completed successfully!");
+})();
diff --git a/1_fetch_all_dois.js b/1_fetch_all_dois.js
new file mode 100644
index 0000000..6b60a8f
--- /dev/null
+++ b/1_fetch_all_dois.js
@@ -0,0 +1,88 @@
+const fs = require('fs');
+const axios = require('axios');
+const path = require('path');
+
+const OUTPUT_DIR = path.join(__dirname, 'doi');
+const BASE_URL = 'https://api.scai.sh/dois?page=';
+const TOTAL_PAGES = 883431;
+const DELAY_MS = 2000;
+
+// Parse CLI arguments: --start-page=XX --end-page=XX
+const args = process.argv.slice(2);
+const getArg = (name) => {
+  const prefix = `--${name}=`;
+  const found = args.find(arg => arg.startsWith(prefix));
+  return found ? parseInt(found.slice(prefix.length), 10) : undefined;
+};
+
+const cliStartPage = getArg('start-page');
+const cliEndPage = getArg('end-page');
+
+// Ensure output directory exists
+if (!fs.existsSync(OUTPUT_DIR)) {
+  fs.mkdirSync(OUTPUT_DIR);
+}
+
+// Utility: Delay between requests
+const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));
+
+// Get last downloaded page number from existing files
+function getLastDownloadedPage() {
+  const files = fs.readdirSync(OUTPUT_DIR);
+  const pageNumbers = files
+    .map(file => {
+      const match = file.match(/page_(\d+)\.json$/);
+      return match ? parseInt(match[1], 10) : null;
+    })
+    .filter(n => n !== null)
+    .sort((a, b) => a - b);
+  return pageNumbers.length ? pageNumbers[pageNumbers.length - 1] : 0;
+}
+
+// Download pages startPage..endPage (inclusive) into OUTPUT_DIR; stops at the first failed request.
+async function downloadAllPages(startPage, endPage) {
+  for (let page = startPage; page <= endPage; page++) {
+    const filePath = path.join(OUTPUT_DIR, `page_${page}.json`);
+    if (fs.existsSync(filePath)) {
+      console.log(`✅ Page ${page} already exists. Skipping.`);
+      continue;
+    }
+
+    const url = `${BASE_URL}${page}`;
+    try {
+      console.log(`🔍 Fetching page ${page}...`);
+      const { data } = await axios.get(url);
+
+      if (data && Array.isArray(data.dois)) {
+        fs.writeFileSync(filePath, JSON.stringify(data.dois, null, 2));
+        console.log(`✅ Page ${page} saved (${data.dois.length} DOIs)`);
+      } else {
+        console.warn(`⚠️ Page ${page} response missing 'dois' array. Skipping.`);
+      }
+    } catch (err) {
+      console.error(`❌ Failed to fetch page ${page}: ${err.message}`);
+      console.log('🛑 Stopping script. You can rerun it to resume.');
+      // Exit non-zero so 0_run_workflow.js (which runs this via execSync) halts the pipeline.
+      process.exitCode = 1;
+      return;
+    }
+
+    await sleep(DELAY_MS);
+  }
+  console.log('🎉 Finished fetching pages.');
+}
+
+// Entry point
+async function main() {
+  if (cliStartPage !== undefined && cliEndPage !== undefined) {
+    console.log(`🚀 Running in range mode: page ${cliStartPage} → ${cliEndPage}`);
+    await downloadAllPages(cliStartPage, cliEndPage);
+  } else {
+    const start = getLastDownloadedPage() + 1;
+    const end = TOTAL_PAGES;
+    console.log(`🔁 Resuming from page ${start} → ${end}`);
+    await downloadAllPages(start, end);
+  }
+}
+
+main();
diff --git a/1_upload_basic_metadata.js b/1_upload_basic_metadata.js
deleted file mode 100644
index d0c70bd..0000000
--- a/1_upload_basic_metadata.js
+++ /dev/null
@@ -1,101 +0,0 @@
-require("dotenv").config();
-const { Uploader } = require("@irys/upload");
-const { Solana } = require("@irys/upload-solana");
-const fs = require("fs").promises;
-const path = require("path");
-
-// 初始化上传器
-const getIrysUploader = async () => {
-    try {
-        const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
-        console.log("✅ Irys uploader initialized.");
-        return irysUploader;
-    } catch (error) {
-        console.error("❌ Failed to initialize Irys uploader:", error);
-        return null;
-    }
-};
-
-// 上传 basic_metadata.json 中的数据
-const uploadBasicMetadata = async () => {
-    const irys = await getIrysUploader();
-    if (!irys) {
-        console.error("Irys uploader could not be initialized.");
-        return;
-    }
-
-    try {
-        const filePath = path.join(process.cwd(), 'basic_metadata.json');
-        console.log(`📄 Reading file: ${filePath}`);
-        
-        const content = await fs.readFile(filePath, 'utf8');
-        const papers = JSON.parse(content);
-        
-        console.log(`📚 Loaded ${papers.length} papers for processing`);
-
-        let successCount = 0;
-        let failCount = 0;
-
-        for (let i = 0; i < papers.length; i++) {
-            const paper = papers[i];
-            console.log(`\n📄 Processing paper [${i + 1}/${papers.length}]`);
-
-            if (!paper.doi) {
-                console.log(`⚠️ Skipping paper: No DOI found`);
-                failCount++;
-                continue;
-            }
-
-            try {
-                const normalizedDoi = paper.doi.trim();
-                const normalizedTitle = (paper.title || "")
-                    .replace(/\s+/g, ' ')
-                    .replace(/\n/g, '')
-                    .trim();
-
-                const normalizedAuthors = (paper.authors || "")
-                    .replace(/\s+/g, ' ')
-                    .replace(/\n/g, '')
-                    .trim();
-
-                const tags = [
-                    { name: "App-Name", value: "scivault" },
-                    { name: "Content-Type", value: "application/json" },
-                    { name: "Version", value: "2.0.0" },
-                    { name: "doi", value: normalizedDoi },
-                    { name: "title", value: normalizedTitle },
-                    { name: "authors", value: normalizedAuthors },
-                    { name: "aid", value: paper.aid || "" }
-                ];
-
-                const paperMetadata = Buffer.from(JSON.stringify(paper));
-                const receipt = await irys.upload(paperMetadata, { tags });
-
-                console.log(`✅ Uploaded: ${normalizedDoi} (${receipt.id})`);
-                successCount++;
-
-            } catch (error) {
-                console.error(`❌ Failed: ${paper.doi} - ${error.message}`);
-                failCount++;
-            }
-
-            if ((i + 1) % 10 === 0 || i === papers.length - 1) {
-                console.log(`\n📊 Progress Report:`);
-                console.log(`   ✅ Success: ${successCount}`);
-                console.log(`   ❌ Failed: ${failCount}`);
-                console.log(`   🔄 Progress: ${Math.round((i + 1) / papers.length * 100)}%`);
-            }
-        }
-
-        console.log(`\n✨ Upload Complete`);
-        console.log(`   ✅ Total Success: ${successCount}`);
-        console.log(`   ❌ Total Failed: ${failCount}`);
-        console.log(`   📈 Success Rate: ${Math.round(successCount / papers.length * 100)}%`);
-
-    } catch (error) {
-        console.error("❌ Error uploading metadata:", error);
-    }
-};
-
-// 执行上传
-uploadBasicMetadata().catch(console.error);
diff --git a/2_fetch_all_pdfs.js b/2_fetch_all_pdfs.js
new file mode 100644
index 0000000..d05bd18
--- /dev/null
+++ b/2_fetch_all_pdfs.js
@@ -0,0 +1,165 @@
+const fs = require('fs');
+const path = require('path');
+const https = require('https');
+const axios = require('axios');
+
+// === Configuration ===
+const DOI_DIR = './doi';
+const PDF_DIR = './pdf';
+const SCI_HUB_MIRRORS = [
+  'https://sci-hub.st/',
+  'https://sci-hub.se/',
+  'https://sci-hub.ru/',
+  'https://www.tesble.com/',
+];
+const DELAY_MS = 3000;
+const MIN_VALID_SIZE = 1024;
+
+// === CLI Argument Parser ===
+const args = process.argv.slice(2);
+const getArg = (name) => { // Reads `--<name>=<value>` from the CLI arguments and returns <value> parsed as a base-10 integer.
+  const prefix = `--${name}=`;
+  const found = args.find(arg => arg.startsWith(prefix)); // first matching argument wins
+  return found ? parseInt(found.slice(prefix.length), 10) : undefined; // undefined when the flag is absent; NaN when the value is not numeric
+};
+const cliStart = getArg("start-page");
+const cliEnd = getArg("end-page");
+
+// === Utility Functions ===
+const sleep = ms => new Promise(resolve => setTimeout(resolve, ms)); // resolves after `ms` milliseconds
+function ensureDir(dirPath) { // creates dirPath (including missing parents) when it does not exist yet
+  if (!fs.existsSync(dirPath)) fs.mkdirSync(dirPath, { recursive: true });
+}
+// Streams `url` into `filePath`. Resolves true when the saved file has at least MIN_VALID_SIZE bytes; otherwise removes it and resolves false. Request and stream errors are logged and yield false.
+async function downloadPdfFromUrl(url, filePath) {
+  try {
+    const response = await axios({
+      url,
+      method: 'GET',
+      responseType: 'stream',
+      httpsAgent: new https.Agent({ rejectUnauthorized: false }) // NOTE(review): disables TLS certificate verification
+    });
+    const writer = fs.createWriteStream(filePath); // opened only after the request succeeded, so a failed request leaves no empty file
+    response.data.pipe(writer);
+    return await new Promise((resolve, reject) => { // awaited so that stream errors reach the catch below
+      writer.on('finish', () => {
+        const stats = fs.statSync(filePath);
+        if (stats.size >= MIN_VALID_SIZE) {
+          console.log(`✅ Downloaded: ${url}`);
+          resolve(true);
+        } else {
+          fs.unlinkSync(filePath);
+          console.warn(`❌ Download too small: ${url}`);
+          resolve(false);
+        }
+      });
+      writer.on('error', reject);
+      response.data.on('error', (err) => { writer.destroy(); fs.unlink(filePath, () => reject(err)); }); // pipe() does not forward source errors: drop the truncated file instead of hanging
+    });
+  } catch (err) {
+    console.error(`❌ Download failed: ${url}`, err.message);
+    return false;
+  }
+}
+// Scrapes `mirror` for `doi`: loads the page, takes the PDF URL from its <embed> tag and downloads it to `outputPath`. Returns false on any failure.
+async function extractPdfLinkAndDownload(doi, mirror, outputPath) {
+  try {
+    const url = mirror + encodeURIComponent(doi);
+    const response = await axios.get(url, { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }); // NOTE(review): TLS certificate verification is disabled
+    const html = response.data;
+
+    const embedMatch = html.match(/<embed[^>]*src=["']([^"']+\.pdf[^"']*)["']/i); // first <embed> whose src contains ".pdf"
+    if (!embedMatch || !embedMatch[1]) {
+      console.warn(`❌ No PDF embed found for ${doi}`);
+      return false;
+    }
+
+    let pdfUrl = embedMatch[1];
+    if (pdfUrl.startsWith('//')) {
+      pdfUrl = 'https:' + pdfUrl; // protocol-relative link
+    } else if (!pdfUrl.startsWith('http')) {
+      pdfUrl = mirror + (pdfUrl.startsWith('/') ? pdfUrl.slice(1) : pdfUrl); // relative link, appended to the mirror base URL
+    }
+
+    return await downloadPdfFromUrl(pdfUrl, outputPath);
+  } catch (err) {
+    console.warn(`❌ Error scraping ${mirror} for ${doi}: ${err.message}`);
+    return false;
+  }
+}
+// Tries each entry of SCI_HUB_MIRRORS in order, pausing 1s after a failed attempt; returns true on the first successful download, false when every mirror fails.
+async function tryAllMirrors(doi, outputPath) {
+  for (const mirror of SCI_HUB_MIRRORS) {
+    const success = await extractPdfLinkAndDownload(doi, mirror, outputPath);
+    if (success) return true;
+    await sleep(1000);
+  }
+  return false;
+}
+// Downloads every DOI listed in doi/<pageFile> into pdf/page_<N>/, skipping valid existing PDFs and DOIs recorded in that page's failed log.
+async function processPage(pageFile) {
+  const pageNum = pageFile.match(/\d+/)[0]; // first run of digits in the file name
+  const doiPath = path.join(DOI_DIR, pageFile);
+  const outDir = path.join(PDF_DIR, `page_${pageNum}`);
+  ensureDir(outDir);
+
+  const failedLogPath = path.join(outDir, `failed_log_page_${pageNum}.txt`);
+  let failedDois = new Set();
+  if (fs.existsSync(failedLogPath)) {
+    failedDois = new Set(fs.readFileSync(failedLogPath, 'utf8').split('\n').filter(Boolean)); // one DOI per line, blank lines dropped
+  }
+
+  const dois = JSON.parse(fs.readFileSync(doiPath, 'utf8')); // iterated below as a list of DOI strings
+
+  for (const doi of dois) {
+    const doiSafe = encodeURIComponent(doi); // percent-encoded so a "/" in the DOI cannot create sub-paths
+    const pdfPath = path.join(outDir, `${doiSafe}.pdf`);
+
+    if (fs.existsSync(pdfPath)) {
+      const stats = fs.statSync(pdfPath);
+      if (stats.size >= MIN_VALID_SIZE) {
+        console.log(`✅ Already exists: ${pdfPath}`);
+        continue;
+      } else {
+        console.warn(`⚠️ Removing invalid file: ${pdfPath}`);
+        fs.unlinkSync(pdfPath);
+      }
+    }
+
+    if (failedDois.has(doi)) {
+      console.log(`⚠️ Previously failed: ${doi}, skipping`);
+      continue;
+    }
+
+    console.log(`📄 Downloading DOI: ${doi}`);
+    const success = await tryAllMirrors(doi, pdfPath);
+    if (!success) {
+      fs.appendFileSync(failedLogPath, `${doi}\n`); // recorded so later runs skip this DOI
+      console.error(`❌ Failed to download ${doi}`);
+    }
+
+    await sleep(DELAY_MS); // pause between download attempts; skipped DOIs `continue` before reaching it
+  }
+}
+// Entry point: downloads PDFs for every doi/page_N.json in page order, optionally limited by --start-page / --end-page.
+async function main() {
+  ensureDir(PDF_DIR);
+
+  const pageFiles = fs.readdirSync(DOI_DIR)
+    .filter(f => /^page_\d+\.json$/.test(f)) // strict match: a name such as page_index.json has no digits and would crash the numeric parsing below
+    .sort((a, b) => parseInt(a.match(/\d+/)[0], 10) - parseInt(b.match(/\d+/)[0], 10));
+
+  const filtered = pageFiles.filter(f => {
+    const page = parseInt(f.match(/\d+/)[0], 10);
+    return (!cliStart || page >= cliStart) && (!cliEnd || page <= cliEnd); // an unset bound does not restrict the range
+  });
+
+  for (const file of filtered) {
+    console.log(`\n=== Processing ${file} ===`);
+    await processPage(file);
+  }
+
+  console.log('\n🎉 All requested PDF downloads finished.');
+}
+
+main().catch((err) => { console.error('❌ Fatal error:', err); process.exitCode = 1; }); // report failures instead of leaving an unhandled rejection
diff --git a/2_upload_pdf.js b/2_upload_pdf.js
deleted file mode 100644
index e6de4bf..0000000
--- a/2_upload_pdf.js
+++ /dev/null
@@ -1,198 +0,0 @@
-require("dotenv").config();
-const { Uploader } = require("@irys/upload");
-const { Solana } = require("@irys/upload-solana");
-const fs = require("fs").promises;
-const path = require("path");
-
-// 初始化 Irys 上传器
-const getIrysUploader = async () => {
-    try {
-        const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
-        console.log("✅ Irys uploader initialized.");
-        return irysUploader;
-    } catch (error) {
-        console.error("❌ Failed to initialize Irys uploader:", error);
-        return null;
-    }
-};
-
-// 遍历目录查找 PDF 文件
-async function walkDir(dir) {
-    try {
-        const files = await fs.readdir(dir);
-        const pdfFiles = files.filter(file => file.toLowerCase().endsWith('.pdf'));
-        return pdfFiles.map(file => path.join(dir, file));
-    } catch (error) {
-        console.error('Error reading directory:', error);
-        throw error;
-    }
-}
-
-// 读取 JSON 元数据文件中的 DOI
-async function getDoiFromMetadata(pdfPath) {
-    try {
-        const jsonPath = pdfPath.replace('.pdf', '.json');
-        console.log(`🔍 Looking for metadata file: ${jsonPath}`);
-        const jsonData = await fs.readFile(jsonPath, 'utf8');
-        const metadata = JSON.parse(jsonData);
-        if (!metadata.doi) throw new Error(`No DOI found in metadata file: ${jsonPath}`);
-        console.log(`✅ Found DOI: ${metadata.doi}`);
-        return metadata.doi;
-    } catch (error) {
-        console.error(`❌ Error getting DOI from metadata:`, error);
-        throw error;
-    }
-}
-
-// 上传单个 PDF（不再切片）
-const uploadPdf = async (inputPath, doi) => {
-    try {
-        console.log(`\n📄 Processing PDF: ${path.basename(inputPath)}`);
-
-        // 1. 检查是否已上传过
-        const query = `
-            query {
-                transactions(
-                    tags: [
-                        { name: "App-Name", values: ["scivault"] },
-                        { name: "Content-Type", values: ["application/pdf"] },
-                        { name: "Version", values: ["2.0.0"] },
-                        { name: "doi", values: ["${doi}"] }
-                    ]
-                ) {
-                    edges {
-                        node { id }
-                    }
-                }
-            }
-        `;
-
-        const response = await fetch("https://uploader.irys.xyz/graphql", {
-            method: "POST",
-            headers: { "Content-Type": "application/json" },
-            body: JSON.stringify({ query })
-        });
-
-        const result = await response.json();
-        if (result.data?.transactions?.edges?.[0]?.node?.id) {
-            console.log(`⚠️ PDF already uploaded for DOI: ${doi}`);
-            return result.data.transactions.edges.map(edge => edge.node.id);
-        }
-
-        // 2. 上传 PDF
-        const irys = await getIrysUploader();
-        if (!irys) throw new Error("Failed to initialize Irys uploader");
-
-        const buffer = await fs.readFile(inputPath);
-        const tags = [
-            { name: "App-Name", value: "scivault" },
-            { name: "Content-Type", value: "application/pdf" },
-            { name: "Version", value: "2.0.0" },
-            { name: "doi", value: doi }
-        ];
-
-        const receipt = await irys.upload(buffer, { tags });
-        console.log(`✅ PDF uploaded successfully. Transaction ID: ${receipt.id}`);
-        return [receipt.id];
-
-    } catch (error) {
-        console.error(`❌ Error uploading PDF: ${error.message}`);
-        throw error;
-    }
-};
-
-// 错误记录
-async function logError(filePath, error, doi = null) {
-    const errorLogPath = path.join(process.cwd(), 'upload_errors.json');
-    try {
-        let errorLog = [];
-        try {
-            const existingLog = await fs.readFile(errorLogPath, 'utf8');
-            errorLog = JSON.parse(existingLog);
-        } catch {}
-
-        errorLog.push({
-            timestamp: new Date().toISOString(),
-            file: filePath,
-            doi: doi,
-            error: error.message || String(error),
-            stack: error.stack
-        });
-
-        await fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2));
-        console.log(`📝 Error logged to ${errorLogPath}`);
-    } catch (logError) {
-        console.error('❌ Failed to log error:', logError);
-    }
-}
-
-// 批量上传 PDF 主函数
-const uploadPdfs = async (pdfDir) => {
-    try {
-        const files = await walkDir(pdfDir);
-        console.log(`\n📁 Found ${files.length} PDF files in ${pdfDir}`);
-
-        let successCount = 0;
-        let failCount = 0;
-        let errorFiles = [];
-
-        for (let i = 0; i < files.length; i++) {
-            const pdfFile = files[i];
-            let doi = null;
-            try {
-                doi = await getDoiFromMetadata(pdfFile);
-                await uploadPdf(pdfFile, doi);
-                successCount++;
-            } catch (error) {
-                failCount++;
-                await logError(pdfFile, error, doi);
-                errorFiles.push({ file: pdfFile, doi: doi, error: error.message });
-            }
-
-            if ((i + 1) % 5 === 0 || i === files.length - 1) {
-                console.log(`\n📊 Progress Report:`);
-                console.log(`   ✅ Success: ${successCount}`);
-                console.log(`   ❌ Failed: ${failCount}`);
-                console.log(`   🔄 Progress: ${Math.round((i + 1) / files.length * 100)}%`);
-            }
-        }
-
-        // 写入报告
-        const report = {
-            timestamp: new Date().toISOString(),
-            totalFiles: files.length,
-            successCount,
-            failCount,
-            successRate: `${Math.round(successCount / files.length * 100)}%`,
-            failedFiles: errorFiles
-        };
-
-        const reportPath = path.join(process.cwd(), 'upload_report.json');
-        await fs.writeFile(reportPath, JSON.stringify(report, null, 2));
-
-        console.log(`\n🎉 Upload Complete`);
-        console.log(`   ✅ Total Success: ${successCount}`);
-        console.log(`   ❌ Total Failed: ${failCount}`);
-        console.log(`   📄 Report saved to: ${reportPath}`);
-        if (failCount > 0) {
-            console.log(`   📌 Error log saved to: upload_errors.json`);
-        }
-
-    } catch (error) {
-        console.error("❌ Error in upload process:", error);
-        await logError('global', error);
-    }
-};
-
-// CLI 执行入口
-if (require.main === module) {
-    const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata');
-    uploadPdfs(metadataDir).catch(console.error);
-}
-
-// 可导出函数供其他模块调用
-module.exports = {
-    getIrysUploader,
-    uploadPdf,
-    uploadPdfs
-};
diff --git a/3_generate_basic_metadata.js b/3_generate_basic_metadata.js
new file mode 100644
index 0000000..26335f5
--- /dev/null
+++ b/3_generate_basic_metadata.js
@@ -0,0 +1,99 @@
+const fs = require('fs');
+const path = require('path');
+const axios = require('axios');
+
+// === Configuration ===
+const PDF_BASE_DIR = './pdf';
+const OPENALEX_BASE_URL = 'https://api.openalex.org/works/doi:';
+const DELAY_MS = 1500;
+
+// === CLI Argument Parser ===
+const args = process.argv.slice(2);
+const getArg = (name) => { // Reads `--<name>=<value>` from the CLI arguments and returns <value> parsed as a base-10 integer.
+  const prefix = `--${name}=`;
+  const found = args.find(arg => arg.startsWith(prefix)); // first matching argument wins
+  return found ? parseInt(found.slice(prefix.length), 10) : undefined; // undefined when the flag is absent; NaN when the value is not numeric
+};
+const cliStart = getArg("start-page");
+const cliEnd = getArg("end-page");
+
+// === Utilities ===
+const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); // resolves after `ms` milliseconds
+
+// Rebuild plain abstract text from an inverted index ({ word: [positions] }); returns '' for a missing or non-object index.
+const parseAbstract = (index) => {
+  if (!index || typeof index !== 'object') return '';
+  const words = [];
+  for (const [word, positions] of Object.entries(index)) {
+    positions.forEach(pos => {
+      words[pos] = word; // place the word at every position it occupies
+    });
+  }
+  return words.join(' '); // positions that were never filled join as empty strings
+};
+
+// Reduce an OpenAlex work record to { title, authors, abstract, doi, aid }; every field falls back to ''.
+const extractMetadata = (data) => {
+  const title = data.title || data.display_name || '';
+  const authors = (data.authorships || [])
+    .map(a => a.author?.display_name)
+    .filter(Boolean) // drop authorships without a display name
+    .join(', ');
+  const abstract = parseAbstract(data.abstract_inverted_index);
+  const doi = data.doi?.replace('https://doi.org/', '') || ''; // strip the resolver prefix, keeping the bare DOI
+  const aid = data.id?.replace('https://openalex.org/', '') || ''; // strip the site prefix, keeping the trailing id
+  return { title, authors, abstract, doi, aid };
+};
+
+// Looks up OpenAlex metadata for every PDF in `pageDir` (DOI taken from the file name) and writes the collected records to <pageDir>/basic_metadata.json.
+async function generateMetadataForPage(pageDir) {
+  const pageNum = pageDir.match(/\d+/)[0]; // first run of digits in the path
+  console.log(`\n📁 Processing folder: page_${pageNum}`);
+
+  const pdfFiles = fs.readdirSync(pageDir).filter(f => f.endsWith('.pdf'));
+  const metadataList = [];
+
+  for (const file of pdfFiles) {
+    const doiEncoded = file.replace(/\.pdf$/, '');
+    const doi = decodeURIComponent(doiEncoded); // file names are percent-encoded DOIs (see 2_fetch_all_pdfs.js)
+    const openalexUrl = `${OPENALEX_BASE_URL}${doi}`;
+
+    try {
+      console.log(`🔍 Fetching metadata for DOI: ${doi}`);
+      const response = await axios.get(openalexUrl);
+      const metadata = extractMetadata(response.data);
+      metadataList.push(metadata);
+    } catch (error) {
+      console.warn(`⚠️ Failed to fetch metadata for ${doi}: ${error.message}`); // a failed lookup is only logged; that DOI is left out of the output
+    }
+
+    await sleep(DELAY_MS); // pause between OpenAlex requests
+  }
+
+  const outputPath = path.join(pageDir, 'basic_metadata.json');
+  fs.writeFileSync(outputPath, JSON.stringify(metadataList, null, 2)); // replaces any existing file
+  console.log(`✅ Saved metadata to ${outputPath}`);
+}
+
+// Entry point: generates basic_metadata.json for every pdf/page_N directory in page order, optionally limited by --start-page / --end-page.
+async function main() {
+  const subdirs = fs.readdirSync(PDF_BASE_DIR)
+    .filter(d => /^page_\d+$/.test(d)) // strict match: an entry such as page_tmp has no digits and would crash the numeric parsing below
+    .sort((a, b) => parseInt(a.match(/\d+/)[0], 10) - parseInt(b.match(/\d+/)[0], 10))
+    .filter(d => {
+      const page = parseInt(d.match(/\d+/)[0], 10);
+      if (cliStart && page < cliStart) return false;
+      if (cliEnd && page > cliEnd) return false;
+      return true;
+    })
+    .map(d => path.join(PDF_BASE_DIR, d))
+    .filter(d => fs.statSync(d).isDirectory());
+
+  for (const pageDir of subdirs) {
+    await generateMetadataForPage(pageDir);
+  }
+
+  console.log('\n🎉 Metadata generation completed for all selected folders.');
+}
+
+main().catch((err) => { console.error('❌ Fatal error:', err); process.exitCode = 1; }); // report failures instead of leaving an unhandled rejection
diff --git a/3_upload_all_metadata.js b/3_upload_all_metadata.js
deleted file mode 100644
index 2947955..0000000
--- a/3_upload_all_metadata.js
+++ /dev/null
@@ -1,200 +0,0 @@
-require("dotenv").config();
-const { Uploader } = require("@irys/upload");
-const { Solana } = require("@irys/upload-solana");
-const fs = require("fs").promises;
-const path = require("path");
-
-const getIrysUploader = async () => {
-    try {
-        const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
-        console.log("Irys uploader initialized.");
-        return irysUploader;
-    } catch (error) {
-        console.error("Failed to initialize Irys uploader:", error);
-        return null;
-    }
-};
-
-async function walkDir(dir) {
-    try {
-        const files = await fs.readdir(dir);
-        const jsonFiles = files.filter(file => file.toLowerCase().endsWith('.json'));
-        return jsonFiles.map(file => path.join(dir, file));
-    } catch (error) {
-        console.error('Error reading directory:', error);
-        throw error;
-    }
-}
-
-async function uploadMetadata(jsonPath) {
-    try {
-        console.log(`\n📄 Processing metadata: ${path.basename(jsonPath)}`);
-        
-        // Read and parse JSON file
-        const jsonData = await fs.readFile(jsonPath, 'utf8');
-        const metadata = JSON.parse(jsonData);
-        
-        if (!metadata.doi) {
-            throw new Error(`No DOI found in metadata file: ${jsonPath}`);
-        }
-
-        // Check if metadata was already uploaded
-        const query = `
-            query {
-                transactions(
-                    tags: [
-                        { name: "Content-Type", values: ["metadata/json"] },
-                        { name: "App-Name", values: ["scivault"] },
-                        { name: "Version", values: ["1.0.3"] },
-                        { name: "doi", values: ["${metadata.doi}"] }
-                    ]
-                ) {
-                    edges {
-                        node {
-                            id
-                        }
-                    }
-                }
-            }
-        `;
-
-        const response = await fetch("https://uploader.irys.xyz/graphql", {
-            method: "POST",
-            headers: { "Content-Type": "application/json" },
-            body: JSON.stringify({ query })
-        });
-
-        const result = await response.json();
-        if (result.data?.transactions?.edges?.[0]?.node?.id) {
-            console.log(`⚠️ Metadata already exists for DOI: ${metadata.doi}`);
-            return result.data.transactions.edges[0].node.id;
-        }
-
-        // Upload metadata
-        const irys = await getIrysUploader();
-        if (!irys) {
-            throw new Error("Failed to initialize Irys uploader");
-        }
-
-        const tags = [
-            { name: "Content-Type", value: "metadata/json" },
-            { name: "App-Name", value: "scivault" },
-            { name: "Version", value: "1.0.3" }
-        ];
-
-        for (const [key, value] of Object.entries(metadata)) {
-            if (value && typeof value === 'string') {
-                tags.push({ name: key, value: value });
-            }
-        }
-
-        const receipt = await irys.upload(jsonData, { tags });
-        console.log(`✅ Metadata uploaded: ${receipt.id}`);
-        return receipt.id;
-
-    } catch (error) {
-        console.error(`❌ Error processing metadata: ${error.message}`);
-        throw error;
-    }
-}
-
-async function logError(filePath, error, doi = null) {
-    const errorLogPath = path.join(process.cwd(), 'metadata_upload_errors.json');
-    try {
-        let errorLog = [];
-        try {
-            const existingLog = await fs.readFile(errorLogPath, 'utf8');
-            errorLog = JSON.parse(existingLog);
-        } catch (e) {
-            // File doesn't exist, use empty array
-        }
-
-        errorLog.push({
-            timestamp: new Date().toISOString(),
-            file: filePath,
-            doi: doi,
-            error: error.message || String(error),
-            stack: error.stack
-        });
-
-        await fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2));
-        console.log(`Error logged to ${errorLogPath}`);
-    } catch (logError) {
-        console.error('Failed to log error:', logError);
-    }
-}
-
-const uploadAllMetadata = async (metadataDir) => {
-    try {
-        const files = await walkDir(metadataDir);
-        console.log(`Found ${files.length} JSON files in ${metadataDir}`);
-
-        let successCount = 0;
-        let failCount = 0;
-        let errorFiles = [];
-
-        for (let i = 0; i < files.length; i++) {
-            const jsonFile = files[i];
-            let doi = null;
-            try {
-                const jsonData = await fs.readFile(jsonFile, 'utf8');
-                const metadata = JSON.parse(jsonData);
-                doi = metadata.doi;
-
-                await uploadMetadata(jsonFile);
-                successCount++;
-            } catch (error) {
-                failCount++;
-                await logError(jsonFile, error, doi);
-                errorFiles.push({
-                    file: jsonFile,
-                    doi: doi,
-                    error: error.message
-                });
-            }
-
-            if ((i + 1) % 5 === 0 || i === files.length - 1) {
-                console.log(`\n📊 Progress Report:`);
-                console.log(`   Success: ${successCount}`);
-                console.log(`   Failed: ${failCount}`);
-                console.log(`   Progress: ${Math.round((i + 1) / files.length * 100)}%`);
-            }
-        }
-
-        const report = {
-            timestamp: new Date().toISOString(),
-            totalFiles: files.length,
-            successCount,
-            failCount,
-            successRate: `${Math.round(successCount / files.length * 100)}%`,
-            failedFiles: errorFiles
-        };
-
-        const reportPath = path.join(process.cwd(), 'metadata_upload_report.json');
-        await fs.writeFile(reportPath, JSON.stringify(report, null, 2));
-
-        console.log(`\n🎉 Upload Complete`);
-        console.log(`   Total Success: ${successCount}`);
-        console.log(`   Total Failed: ${failCount}`);
-        console.log(`   Success Rate: ${Math.round(successCount / files.length * 100)}%`);
-        console.log(`   Detailed report saved to: ${reportPath}`);
-        if (failCount > 0) {
-            console.log(`   Error log saved to: metadata_upload_errors.json`);
-        }
-
-    } catch (error) {
-        console.error("❌ Error in upload process:", error);
-        await logError('global', error);
-    }
-};
-
-if (require.main === module) {
-    const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata');
-    uploadAllMetadata(metadataDir).catch(console.error);
-}
-
-module.exports = {
-    getIrysUploader,
-    uploadMetadata,
-    uploadAllMetadata
-};
diff --git a/4_upload_all_basic_metadata.js b/4_upload_all_basic_metadata.js
new file mode 100644
index 0000000..7184ca2
--- /dev/null
+++ b/4_upload_all_basic_metadata.js
@@ -0,0 +1,131 @@
+require("dotenv").config();
+const { Uploader } = require("@irys/upload");
+const { Solana } = require("@irys/upload-solana");
+const fs = require("fs").promises;
+const path = require("path");
+
+// === Configuration ===
+const PDF_BASE_DIR = './pdf';
+const REPORT_FILENAME = 'upload_basic_metadata_report.txt';
+
+// === CLI Argument Parser ===
+const args = process.argv.slice(2);
+const getArg = (name) => { // Reads `--<name>=<value>` from the CLI arguments and returns <value> parsed as a base-10 integer.
+  const prefix = `--${name}=`;
+  const found = args.find(arg => arg.startsWith(prefix)); // first matching argument wins
+  return found ? parseInt(found.slice(prefix.length), 10) : undefined; // undefined when the flag is absent; NaN when the value is not numeric
+};
+const cliStart = getArg("start-page");
+const cliEnd = getArg("end-page");
+
+// Builds an Irys uploader for Solana from the PRIVATE_KEY environment variable; logs the error and returns null when initialization fails.
+const getIrysUploader = async () => {
+  try {
+    const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
+    console.log("✅ Irys uploader initialized.");
+    return irysUploader;
+  } catch (error) {
+    console.error("❌ Failed to initialize Irys uploader:", error);
+    return null; // the main routine checks for null instead of catching
+  }
+};
+
+// Uploads one paper's metadata as JSON with scivault 2.0.0 tags. Returns { ok: true, id } on success or { ok: false, reason } otherwise; upload errors are logged, not thrown.
+const uploadOneMetadata = async (irys, paper, pageNum, index) => {
+  if (!paper.doi) {
+    console.log(`⚠️ Skipping paper at page ${pageNum}, index ${index}: No DOI`);
+    return { ok: false, reason: 'no-doi' };
+  }
+
+  try {
+    const normalizedDoi = paper.doi.trim();
+    const normalizedTitle = (paper.title || "").replace(/\s+/g, ' ').trim(); // collapse whitespace runs (including newlines) to single spaces
+    const normalizedAuthors = (paper.authors || "").replace(/\s+/g, ' ').trim();
+
+    const tags = [
+      { name: "App-Name", value: "scivault" },
+      { name: "Content-Type", value: "application/json" },
+      { name: "Version", value: "2.0.0" },
+      { name: "doi", value: normalizedDoi },
+      { name: "title", value: normalizedTitle },
+      { name: "authors", value: normalizedAuthors },
+      { name: "aid", value: paper.aid || "" } // NOTE(review): may be an empty string — confirm the uploader accepts empty tag values
+    ];
+
+    const buffer = Buffer.from(JSON.stringify(paper)); // the whole record is the payload; tags carry the normalized fields
+    const receipt = await irys.upload(buffer, { tags });
+
+    console.log(`✅ Uploaded [page_${pageNum} - ${index}]: ${normalizedDoi} (${receipt.id})`);
+    return { ok: true, id: receipt.id };
+  } catch (err) {
+    console.error(`❌ Upload failed [page_${pageNum} - ${index}]: ${paper.doi} - ${err.message}`);
+    return { ok: false, reason: err.message };
+  }
+};
+
+// Uploads every entry of pdf/<pageDir>/basic_metadata.json and writes a per-DOI result report next to it; folders without that file are skipped.
+const uploadPageFolder = async (irys, pageDir) => {
+  const pageNum = pageDir.match(/\d+/)?.[0] || '?'; // '?' is used in log messages when the name has no digits
+  const metaPath = path.join(PDF_BASE_DIR, pageDir, 'basic_metadata.json');
+  const reportPath = path.join(PDF_BASE_DIR, pageDir, REPORT_FILENAME);
+
+  try {
+    await fs.access(metaPath);
+  } catch {
+    console.warn(`⚠️ Skipping page_${pageNum}: no basic_metadata.json`);
+    return;
+  }
+
+  const jsonText = await fs.readFile(metaPath, 'utf8');
+  const papers = JSON.parse(jsonText); // iterated below as an array of paper records
+
+  console.log(`\n📄 Found ${papers.length} papers in page_${pageNum}`);
+  const reportLines = [];
+
+  let success = 0;
+  let fail = 0;
+
+  for (let i = 0; i < papers.length; i++) {
+    const result = await uploadOneMetadata(irys, papers[i], pageNum, i); // sequential: one upload at a time
+    const doi = papers[i].doi || '[no-doi]';
+
+    if (result.ok) {
+      success++;
+      reportLines.push(`✅ ${doi} : ${result.id}`);
+    } else {
+      fail++;
+      reportLines.push(`❌ ${doi} : ${result.reason}`);
+    }
+
+    if ((i + 1) % 10 === 0 || i === papers.length - 1) { // progress line every 10 papers and after the last one
+      console.log(`📊 page_${pageNum} progress: ${i + 1}/${papers.length}, ✅ ${success}, ❌ ${fail}`);
+    }
+  }
+
+  await fs.writeFile(reportPath, reportLines.join('\n'), 'utf8'); // replaces the report of any previous run
+  console.log(`📄 Upload report saved: ${reportPath}`);
+  console.log(`✨ Finished page_${pageNum}: ✅ ${success}, ❌ ${fail}`);
+};
+
+// Entry point: uploads basic metadata for every pdf/page_N folder in page order, optionally limited by --start-page / --end-page.
+(async () => {
+  const irys = await getIrysUploader();
+  if (!irys) return;
+
+  const dirs = await fs.readdir(PDF_BASE_DIR);
+  const pageDirs = dirs
+    .filter(d => /^page_\d+$/.test(d)) // strict match: an entry such as page_tmp has no digits and would crash the numeric parsing below
+    .sort((a, b) => parseInt(a.match(/\d+/)[0], 10) - parseInt(b.match(/\d+/)[0], 10))
+    .filter(d => {
+      const page = parseInt(d.match(/\d+/)[0], 10);
+      if (cliStart && page < cliStart) return false;
+      if (cliEnd && page > cliEnd) return false;
+      return true;
+    });
+
+  for (const pageDir of pageDirs) {
+    await uploadPageFolder(irys, pageDir);
+  }
+
+  console.log('\n🎉 All basic metadata uploads completed.');
+})().catch((err) => { console.error('❌ Fatal error:', err); process.exitCode = 1; }); // report failures instead of leaving an unhandled rejection
diff --git a/5_upload_all_pdfs.js b/5_upload_all_pdfs.js
new file mode 100644
index 0000000..a38880c
--- /dev/null
+++ b/5_upload_all_pdfs.js
@@ -0,0 +1,168 @@
+require("dotenv").config();
+const { Uploader } = require("@irys/upload");
+const { Solana } = require("@irys/upload-solana");
+const fs = require("fs").promises;
+const path = require("path");
+
+// === CONFIG ===
+const BASE_PDF_DIR = path.join(process.cwd(), "pdf");
+const MIN_VALID_SIZE = 1000; // in bytes
+const REPORT_PREFIX = "upload_pdf_report";
+
+// === CLI ===
+const args = process.argv.slice(2);
+const getArg = (name) => { // Reads `--<name>=<value>` from the CLI arguments and returns <value> parsed as a base-10 integer.
+  const prefix = `--${name}=`;
+  const found = args.find(arg => arg.startsWith(prefix)); // first matching argument wins
+  return found ? parseInt(found.slice(prefix.length), 10) : undefined; // undefined when the flag is absent; NaN when the value is not numeric
+};
+const cliStart = getArg("start-page");
+const cliEnd = getArg("end-page");
+
+// Builds an Irys uploader for Solana from the PRIVATE_KEY environment variable; logs the error and returns null when initialization fails.
+const getIrysUploader = async () => {
+  try {
+    const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY);
+    console.log("✅ Irys uploader initialized.");
+    return irysUploader;
+  } catch (error) {
+    console.error("❌ Failed to initialize Irys uploader:", error);
+    return null; // the main routine checks for null instead of catching
+  }
+};
+
+// Recovers the DOI from a PDF path: drops the ".pdf" extension, percent-decodes the base name and trims whitespace.
+function extractDoiFromFilename(filename) {
+  const base = path.basename(filename, ".pdf");
+  return decodeURIComponent(base).replace(/%2F/g, "/").trim(); // NOTE(review): the replace runs after decoding, so it only fires on a literal "%2F" left in the decoded text — confirm it is intended
+}
+
+// Queries the Irys GraphQL endpoint for a scivault 2.0.0 PDF tagged with `doi`; returns the first transaction id or null. Throws when the request or the query itself fails.
+async function checkIfAlreadyUploaded(doi) {
+  const query = `
+    query {
+      transactions(
+        tags: [
+          { name: "App-Name", values: ["scivault"] },
+          { name: "Content-Type", values: ["application/pdf"] },
+          { name: "Version", values: ["2.0.0"] },
+          { name: "doi", values: [${JSON.stringify(doi)}] }
+        ]
+      ) {
+        edges {
+          node { id }
+        }
+      }
+    }
+  `;
+  const response = await fetch("https://uploader.irys.xyz/graphql", { // the DOI above is JSON-escaped so quotes or backslashes cannot break the query
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ query })
+  });
+  if (!response.ok) throw new Error(`Irys GraphQL request failed: HTTP ${response.status}`);
+  const result = await response.json();
+  if (!result.data) throw new Error(`Irys GraphQL query failed: ${JSON.stringify(result.errors)}`); // a failed lookup must not be mistaken for "not uploaded yet"
+  return result.data?.transactions?.edges?.[0]?.node?.id || null;
+}
+
+// Uploads one PDF tagged with the DOI derived from its file name. Returns { status: "ok" | "skip" | "fail", ... }; errors are logged and reported as "fail", not thrown.
+async function uploadOnePdf(irys, filePath) {
+  try {
+    const doi = extractDoiFromFilename(filePath);
+    if (!doi) throw new Error("Invalid DOI from filename");
+
+    const alreadyUploaded = await checkIfAlreadyUploaded(doi); // checked before reading the file
+    if (alreadyUploaded) {
+      console.log(`⚠️ Already uploaded: ${doi}`);
+      return { status: "skip", doi };
+    }
+
+    const buffer = await fs.readFile(filePath);
+    if (buffer.length < MIN_VALID_SIZE) {
+      throw new Error("File too small (<1KB)");
+    }
+
+    const tags = [
+      { name: "App-Name", value: "scivault" },
+      { name: "Content-Type", value: "application/pdf" },
+      { name: "Version", value: "2.0.0" },
+      { name: "doi", value: doi }
+    ];
+
+    const receipt = await irys.upload(buffer, { tags });
+    console.log(`✅ Uploaded ${doi} - ${receipt.id}`);
+    return { status: "ok", doi, id: receipt.id };
+  } catch (error) {
+    console.error(`❌ Failed upload: ${filePath} - ${error.message}`);
+    return { status: "fail", file: filePath, error: error.message }; // identified by path because the DOI may not have been derived
+  }
+}
+
+// Uploads every .pdf in `pageDir` one at a time and writes a JSON report (counts plus per-file results) into the same folder.
+async function processPageFolder(irys, pageDir) {
+  const pageNum = pageDir.match(/page_(\d+)/)?.[1];
+  const files = await fs.readdir(pageDir);
+  const pdfFiles = files.filter(f => f.endsWith(".pdf"));
+
+  console.log(`📂 Processing page_${pageNum} - Found ${pdfFiles.length} PDFs`);
+
+  const result = { ok: [], fail: [], skip: [] }; // per-status buckets of uploadOnePdf results
+
+  for (let i = 0; i < pdfFiles.length; i++) {
+    const file = pdfFiles[i];
+    const filePath = path.join(pageDir, file);
+    const res = await uploadOnePdf(irys, filePath);
+
+    if (res.status === "ok") result.ok.push(res);
+    else if (res.status === "fail") result.fail.push(res);
+    else if (res.status === "skip") result.skip.push(res);
+
+    if ((i + 1) % 10 === 0 || i === pdfFiles.length - 1) { // progress line every 10 files and after the last one
+      console.log(`📊 Progress: ${i + 1}/${pdfFiles.length}`);
+    }
+  }
+
+  // Save report
+  const report = {
+    page: `page_${pageNum}`,
+    timestamp: new Date().toISOString(),
+    total: pdfFiles.length,
+    success: result.ok.length,
+    failed: result.fail.length,
+    skipped: result.skip.length,
+    successRate: `${pdfFiles.length ? Math.round((result.ok.length / pdfFiles.length) * 100) : 0}%`, // guard: an empty folder would otherwise report "NaN%"
+    details: result
+  };
+
+  const reportPath = path.join(pageDir, `${REPORT_PREFIX}_page_${pageNum}.json`);
+  await fs.writeFile(reportPath, JSON.stringify(report, null, 2));
+  console.log(`📝 Report saved to ${reportPath}`);
+}
+
+// Entry point: uploads the PDFs of every pdf/page_N directory in page order, optionally limited by --start-page / --end-page.
+(async () => {
+  const irys = await getIrysUploader();
+  if (!irys) return;
+
+  const dirs = await fs.readdir(BASE_PDF_DIR);
+  const pageDirs = dirs
+    .filter(d => /^page_\d+$/.test(d)) // strict match: an entry such as page_tmp has no digits and would crash the numeric parsing below
+    .sort((a, b) => parseInt(a.match(/\d+/)[0], 10) - parseInt(b.match(/\d+/)[0], 10))
+    .filter(d => {
+      const page = parseInt(d.match(/\d+/)[0], 10);
+      if (cliStart && page < cliStart) return false;
+      if (cliEnd && page > cliEnd) return false;
+      return true;
+    });
+
+  for (const dir of pageDirs) {
+    const fullPath = path.join(BASE_PDF_DIR, dir);
+    const stat = await fs.lstat(fullPath);
+    if (stat.isDirectory()) {
+      await processPageFolder(irys, fullPath);
+    }
+  }
+
+  console.log("\n🎉 All PDF uploads completed.");
+})().catch((err) => { console.error('❌ Fatal error:', err); process.exitCode = 1; }); // report failures instead of leaving an unhandled rejection
diff --git a/fund.js b/9_fund.js
similarity index 100%
rename from fund.js
rename to 9_fund.js
diff --git a/package.json b/package.json
index a2fb714..03008c1 100644
--- a/package.json
+++ b/package.json
@@ -12,13 +12,17 @@
   "dependencies": {
     "@irys/upload": "^0.0.14",
     "@irys/upload-solana": "^0.1.7",
+    "axios": "^1.9.0",
     "bignumber.js": "^9.1.2",
     "cors": "^2.8.5",
     "dotenv": "^16.4.7",
     "express": "^4.17.1",
+    "jsdom": "^26.1.0",
+    "minimist": "^1.2.8",
     "node-fetch": "^2.7.0",
     "pdf-lib": "^1.17.1",
-    "pdfkit": "^0.16.0"
+    "pdfkit": "^0.16.0",
+    "puppeteer": "^24.10.0"
   },
   "keywords": [
     "arweave",
@@ -29,7 +33,6 @@
   ],
   "author": "SciVault",
   "license": "MIT",
-  "devDependencies": {},
   "repository": {
     "type": "git",
     "url": "git+https://github.com/Scihub-Community/sciuploader.git"

From adda4ddbc9e0dd86dcd729bcbfc767420957765b Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Tue, 3 Jun 2025 19:47:11 +0900
Subject: [PATCH 5/8] Refactor: restructure script flow and replace old upload
 logic

---
 README.md | 127 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 69 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 1715ff0..2d851b9 100644
--- a/README.md
+++ b/README.md
@@ -1,86 +1,97 @@
-# uploader for SciBox
+# 📄 SciUploader – Bulk Sci-Hub PDF Downloader
 
-A decentralized academic paper repository system built on Arweave/Irys.
+This tool automates the batch download of academic papers from Sci-Hub using DOIs and organizes the PDFs for further metadata processing and decentralized storage (e.g., Arweave/Irys).
 
-## Prerequisites
+---
 
-1. Node.js (v16 or higher)
-2. Solana wallet with SOL tokens
-3. Create a `.env` file with your Solana private key:
-   ```
-   PRIVATE_KEY=your_solana_private_key_here
-   ```
+## 📦 Project Structure
 
-## Installation
+```
+sciuploader/
+├── doi/                            ← Each page_N.json contains a list of DOIs
+├── pdf/                            ← Downloaded PDFs organized by page
+├── 0_run_workflow.js              ← Run full workflow script
+├── 1_fetch_all_dois.js            ← Fetch DOI list from external source
+├── 2_fetch_all_pdfs.js            ← Download PDFs using DOI list
+├── 3_generate_basic_metadata.js   ← Generate basic metadata JSON
+├── 4_upload_all_basic_metadata.js ← Upload metadata to decentralized storage (TBD)
+├── 5_upload_all_pdfs.js           ← Upload PDFs to decentralized storage (TBD)
+├── 9_fund.js                      ← Funding registration or helper functions
+├── .env.example                   ← Example environment configuration
+└── README.md                      ← This file
+```
+
+---
 
-1. Clone this repository:
-   ```bash
-   git clone https://github.com/SciVault/sciuploader
-   cd sciuploader
-   ```
+## ✅ How to Use
 
-2. Install dependencies:
-   ```bash
-   npm install
-   ```
+### 1. Install dependencies
+
+```bash
+npm install
+```
 
-## Usage
+### 2. Set environment variables (optional)
 
-### Step 0: Prepare Your Data
+Copy `.env.example` to `.env` and fill in the values your run needs; the upload steps read the Solana private key from `PRIVATE_KEY`.
 
-1. Create a `metadata` folder in the project root
-2. Place your metadata JSON files and corresponding PDFs in this folder
-   - Each PDF should have a matching JSON file with the same name (e.g., `paper1.pdf` and `paper1.json`)
-   - JSON files must contain a `doi` field
-3. Run the metadata generator:
-   ```bash
-   node 0_generate_basic_metadata.js
-   ```
-   This will create a `basic_metadata.json` file containing essential paper information.
+---
 
-### Step 1: Upload Basic Metadata
+### 3. Run full workflow
 
-Upload the basic metadata (title, authors, DOI, etc.):
 ```bash
-node 1_upload_basic_metadata.js
+node 0_run_workflow.js
 ```
 
-### Step 2: Upload PDFs
+To split the work into smaller tasks,
+add --start-page=3 --end-page=4 like this, there are total 883431 pages
 
-Upload PDFs (they will be automatically split into chunks):
 ```bash
-node 2_upload_pdf.js
+node 0_run_workflow.js --start-page=3 --end-page=4
 ```
 
-Note: If uploads fail due to network issues, you can safely run the script again. It will skip already uploaded files and continue with failed ones.
 
+Or run step-by-step:
 
-## Version Control
+---
 
-The system uses semantic versioning for content management:
-- Current version: `2.0.0`
-- Format: `MAJOR.MINOR.PATCH`
-  - MAJOR: Breaking changes
-  - MINOR: New features
-  - PATCH: Bug fixes
+### ◾️ Step 1: Fetch all DOIs (optional)
 
-When uploading content, ensure you're using the correct version in the tags.
+```bash
+node 1_fetch_all_dois.js
+```
+
+This fetches DOIs from an API and saves them into `doi/page_N.json` files.
 
-## Error Handling
+---
 
-- Each upload script generates detailed logs:
-  - `upload_report.json`: Summary of upload results
-  - `upload_errors.json`: Details of failed uploads
-- Failed uploads can be retried by running the script again
-- The system checks for existing uploads to avoid duplicates
+### ◾️ Step 2: Download all PDFs
 
-## Web Interface
+```bash
+node 2_fetch_all_pdfs.js --start-page=1 --end-page=10
+```
 
-The `queryweb` folder contains a simple web interface for searching and viewing papers:
-- Search by DOI, title, or arXiv ID
-- View paper metadata
-- Download PDF files
+- Failed downloads are logged to `failed_log_page_N.txt` per page.
+- Already downloaded and valid files are skipped.
+
+---
+
+### ◾️ Step 3: Generate basic metadata
+
+```bash
+node 3_generate_basic_metadata.js
+```
+
+---
+
+### ◾️ Steps 4 & 5: Upload metadata and PDFs
+
+```bash
+node 4_upload_all_basic_metadata.js
+node 5_upload_all_pdfs.js
+```
+---
 
-## License
+## 📜 License
 
-MIT
\ No newline at end of file
+MIT 
\ No newline at end of file

From 3854483142f6c5423f7de1a5ca687470940a0cef Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Fri, 6 Jun 2025 17:14:15 +0900
Subject: [PATCH 6/8] script update now you can run node 0_run_workflow.js
 --start-page=300000 --end-page=400000 --batch-size=10

---
 0_run_workflow.js | 94 +++++++++++++++++++++++++++++++----------------
 1 file changed, 63 insertions(+), 31 deletions(-)

diff --git a/0_run_workflow.js b/0_run_workflow.js
index 5932947..3417867 100644
--- a/0_run_workflow.js
+++ b/0_run_workflow.js
@@ -1,57 +1,89 @@
-// 0_run_workflow.js
 const { execSync } = require("child_process");
+const fs = require("fs");
+const path = require("path");
 
-// Get CLI arguments
+// === CLI Argument Parser === (flags look like --name=<integer>; getArg yields undefined when a flag is absent)
 const args = process.argv.slice(2);
 const getArg = (name) => {
   const prefix = `--${name}=`;
   const found = args.find((arg) => arg.startsWith(prefix));
   return found ? parseInt(found.slice(prefix.length), 10) : undefined;
 };
-
 const startPage = getArg("start-page");
 const endPage = getArg("end-page");
+const batchSize = Math.max(1, getArg("batch-size") || 10); // default 10; clamp to >= 1 because a negative step would make the batch loop never terminate
 
 if (!startPage || !endPage || isNaN(startPage) || isNaN(endPage)) {
-  console.error("❌ Missing or invalid arguments. Usage: node 0_run_workflow.js --start-page=3 --end-page=4");
+  console.error("❌ Usage: node 0_run_workflow.js --start-page=10 --end-page=100 --batch-size=10");
   process.exit(1);
 }
 
-console.log(`🚀 Starting workflow from page ${startPage} to ${endPage}\n`);
-
-const steps = [
-  {
-    name: "📥 Step 1️⃣: Fetching DOI JSON...",
-    command: `node 1_fetch_all_dois.js --start-page=${startPage} --end-page=${endPage}`,
-  },
-  {
-    name: "📄 Step 2️⃣: Downloading PDFs...",
-    command: `node 2_fetch_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`,
-  },
-  {
-    name: "🧠 Step 3️⃣: Generating metadata...",
-    command: `node 3_generate_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`,
-  },
-  {
-    name: "🆙 Step 4️⃣: Uploading metadata to Irys...",
-    command: `node 4_upload_all_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`,
-  },
-  {
-    name: "📤 Step 5️⃣: Uploading PDFs to Irys...",
-    command: `node 5_upload_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`,
-  },
-];
+/** Deletes every *.pdf file inside pdf/page_<page>; the folder itself and any other files are kept. */
+function deletePdfFolder(page) {
+  const pageDir = path.join("pdf", `page_${page}`);
+  // Nothing to clean up (and nothing is logged) when the page folder does not exist.
+  if (!fs.existsSync(pageDir)) {
+    return;
+  }
+  fs.readdirSync(pageDir)
+    .filter((entry) => entry.endsWith(".pdf"))
+    .forEach((entry) => fs.unlinkSync(path.join(pageDir, entry)));
+  console.log(`🧹 Deleted PDF files in folder: ${pageDir}`);
+}
+
+/**
+ * Runs workflow steps 1-5 for one batch of pages, each step as its own child process.
+ *
+ * @param {number} batchStart - First page of the batch (inclusive).
+ * @param {number} batchEnd - Last page of the batch (inclusive).
+ * @returns {Promise<boolean>} false as soon as one step fails (later steps and the
+ *   PDF cleanup are skipped); true once all steps ran and the batch's PDFs were deleted.
+ */
+async function runWorkflowBatch(batchStart, batchEnd) {
+  console.log(`\n🚀 Starting workflow for pages ${batchStart} - ${batchEnd}\n`);
+  // Every step script receives the same page-range flags.
+  const pageRange = `--start-page=${batchStart} --end-page=${batchEnd}`;
+  const stepTable = [
+    ["📥 Step 1️⃣: Fetching DOI JSON...", "1_fetch_all_dois.js"],
+    ["📄 Step 2️⃣: Downloading PDFs...", "2_fetch_all_pdfs.js"],
+    ["🧠 Step 3️⃣: Generating metadata...", "3_generate_basic_metadata.js"],
+    ["🆙 Step 4️⃣: Uploading metadata to Irys...", "4_upload_all_basic_metadata.js"],
+    ["📤 Step 5️⃣: Uploading PDFs to Irys...", "5_upload_all_pdfs.js"],
+  ];
+  const steps = stepTable.map(([name, script]) => ({
+    name,
+    command: `node ${script} ${pageRange}`,
+  }));
+  // Steps run strictly in order; stdio is inherited so each script's output is shown directly.
 
-(async () => {
   for (const step of steps) {
     console.log(`\n${step.name}`);
     try {
       execSync(step.command, { stdio: "inherit" });
     } catch (err) {
-      console.error(`❌ Workflow failed: ${err.message}`);
+      console.error(`❌ Step failed: ${err.message}`);
+      return false;
+    }
+  }
+
+  // All steps succeeded: delete the PDF files of every page in this batch.
+  for (let pageNumber = batchStart; pageNumber <= batchEnd; pageNumber++) {
+    deletePdfFolder(pageNumber);
+  }
+
+  return true;
+}
+
+// Entry point: walk startPage..endPage in chunks of batchSize pages and abort on the first failed batch.
+(async () => {
+  for (let batchStart = startPage; batchStart <= endPage; batchStart += batchSize) {
+    const batchEnd = Math.min(endPage, batchStart + batchSize - 1);
+    const succeeded = await runWorkflowBatch(batchStart, batchEnd);
+    if (!succeeded) {
+      console.error(`❌ Stopping workflow due to error in batch ${batchStart}-${batchEnd}`);
       process.exit(1);
     }
   }
 
-  console.log("\n✅ All steps completed successfully!");
+  console.log("\n✅ All batches completed successfully!");
 })();

From 65d2a34eccf218f261fa4f8d95b4adfb48e18359 Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Fri, 6 Jun 2025 17:17:28 +0900
Subject: [PATCH 7/8] script update now you can run node 0_run_workflow.js
 --start-page=300000 --end-page=400000 --batch-size=10

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2d851b9..23a2701 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ for dividing tasks,
 add --start-page=3 --end-page=4 like this, there are total 883431 pages
 
 ```bash
-node 0_run_workflow.js --start-page=3 --end-page=4
+node 0_run_workflow.js --start-page=300000 --end-page=400000 --batch-size=10
 ```
 
 

From 24aa52c6fac2c758779defc7fdb9a743fbdf7bbe Mon Sep 17 00:00:00 2001
From: heinzhex <B1Z9IC@protonmail.com>
Date: Fri, 6 Jun 2025 17:20:58 +0900
Subject: [PATCH 8/8] script update now you can run node 0_run_workflow.js
 --start-page=300000 --end-page=400000 --batch-size=10

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 23a2701..5677cf8 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # 📄 SciUploader – Bulk Sci-Hub PDF Downloader
 
-This tool automates the batch download of academic papers from Sci-Hub using DOIs and organizes the PDFs for further metadata processing and decentralized storage (e.g., Arweave/Irys).
+This tool automates the batch download of academic papers from Sci-Hub using DOIs and organizes the PDFs for further metadata processing and decentralized storage on Irys.
 
 ---