From 4792125776cbc6b68bbd01fdc7b96c9c8617f196 Mon Sep 17 00:00:00 2001 From: heinzhex Date: Thu, 29 May 2025 02:25:22 +0900 Subject: [PATCH 1/8] feat: update full PDF upload, add fund.js, upgrade metadata tags to v2.0.0 --- .gitignore | Bin 466 -> 506 bytes 1_upload_basic_metadata.js | 43 ++++++----- 2_upload_pdf.js | 142 ++++++++++++------------------------- fund.js | 51 +++++++++++++ package.json | 13 +++- 5 files changed, 129 insertions(+), 120 deletions(-) create mode 100644 fund.js diff --git a/.gitignore b/.gitignore index 9bbd1f23556c0f07691ddf27e495da3c9a2c4cfe..0cd1ee7882957fdfd807d9ee0b28803724029023 100644 GIT binary patch delta 15 Wcmcb_{EK-*0ORB`M!m^lj5Po-papUO delta 11 Scmeyxe2IBO0ORBe#wGwAP6T2A diff --git a/1_upload_basic_metadata.js b/1_upload_basic_metadata.js index caf1a99..d0c70bd 100644 --- a/1_upload_basic_metadata.js +++ b/1_upload_basic_metadata.js @@ -4,17 +4,19 @@ const { Solana } = require("@irys/upload-solana"); const fs = require("fs").promises; const path = require("path"); +// 初始化上传器 const getIrysUploader = async () => { try { const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("Irys uploader initialized."); + console.log("✅ Irys uploader initialized."); return irysUploader; } catch (error) { - console.error("Failed to initialize Irys uploader:", error); + console.error("❌ Failed to initialize Irys uploader:", error); return null; } }; +// 上传 basic_metadata.json 中的数据 const uploadBasicMetadata = async () => { const irys = await getIrysUploader(); if (!irys) { @@ -23,14 +25,13 @@ const uploadBasicMetadata = async () => { } try { - // Read the basic_metadata.json file const filePath = path.join(process.cwd(), 'basic_metadata.json'); - console.log(`Reading file: ${filePath}`); + console.log(`📄 Reading file: ${filePath}`); const content = await fs.readFile(filePath, 'utf8'); const papers = JSON.parse(content); - console.log(`Loaded ${papers.length} papers for processing`); + console.log(`📚 Loaded 
${papers.length} papers for processing`); let successCount = 0; let failCount = 0; @@ -47,12 +48,12 @@ const uploadBasicMetadata = async () => { try { const normalizedDoi = paper.doi.trim(); - const normalizedTitle = paper.title - .replace(/\s+/g, ' ') // Replace multiple spaces with single space - .replace(/\n/g, '') // Remove newlines - .trim(); // Remove leading/trailing spaces + const normalizedTitle = (paper.title || "") + .replace(/\s+/g, ' ') + .replace(/\n/g, '') + .trim(); - const normalizedAuthors = paper.authors + const normalizedAuthors = (paper.authors || "") .replace(/\s+/g, ' ') .replace(/\n/g, '') .trim(); @@ -60,17 +61,17 @@ const uploadBasicMetadata = async () => { const tags = [ { name: "App-Name", value: "scivault" }, { name: "Content-Type", value: "application/json" }, - { name: "Version", value: "1.0.3" }, + { name: "Version", value: "2.0.0" }, { name: "doi", value: normalizedDoi }, { name: "title", value: normalizedTitle }, { name: "authors", value: normalizedAuthors }, - { name: "aid", value: paper.aid } + { name: "aid", value: paper.aid || "" } ]; const paperMetadata = Buffer.from(JSON.stringify(paper)); const receipt = await irys.upload(paperMetadata, { tags }); - console.log(`✅ Uploaded: ${paper.doi} (${receipt.id})`); + console.log(`✅ Uploaded: ${normalizedDoi} (${receipt.id})`); successCount++; } catch (error) { @@ -78,25 +79,23 @@ const uploadBasicMetadata = async () => { failCount++; } - // Progress report every 10 papers if ((i + 1) % 10 === 0 || i === papers.length - 1) { console.log(`\n📊 Progress Report:`); - console.log(` Success: ${successCount}`); - console.log(` Failed: ${failCount}`); - console.log(` Progress: ${Math.round((i + 1) / papers.length * 100)}%`); + console.log(` ✅ Success: ${successCount}`); + console.log(` ❌ Failed: ${failCount}`); + console.log(` 🔄 Progress: ${Math.round((i + 1) / papers.length * 100)}%`); } } console.log(`\n✨ Upload Complete`); - console.log(` Final Results:`); - console.log(` Total Success: 
${successCount}`); - console.log(` Total Failed: ${failCount}`); - console.log(` Success Rate: ${Math.round(successCount / papers.length * 100)}%`); + console.log(` ✅ Total Success: ${successCount}`); + console.log(` ❌ Total Failed: ${failCount}`); + console.log(` 📈 Success Rate: ${Math.round(successCount / papers.length * 100)}%`); } catch (error) { console.error("❌ Error uploading metadata:", error); } }; -// Run the upload process +// 执行上传 uploadBasicMetadata().catch(console.error); diff --git a/2_upload_pdf.js b/2_upload_pdf.js index d468847..e6de4bf 100644 --- a/2_upload_pdf.js +++ b/2_upload_pdf.js @@ -1,23 +1,22 @@ require("dotenv").config(); const { Uploader } = require("@irys/upload"); const { Solana } = require("@irys/upload-solana"); -const { PDFDocument } = require("pdf-lib"); const fs = require("fs").promises; const path = require("path"); -const MAX_SLICE_SIZE = 50 * 1024; // 50KB per slice - +// 初始化 Irys 上传器 const getIrysUploader = async () => { try { const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("Irys uploader initialized."); + console.log("✅ Irys uploader initialized."); return irysUploader; } catch (error) { - console.error("Failed to initialize Irys uploader:", error); + console.error("❌ Failed to initialize Irys uploader:", error); return null; } }; +// 遍历目录查找 PDF 文件 async function walkDir(dir) { try { const files = await fs.readdir(dir); @@ -29,64 +28,40 @@ async function walkDir(dir) { } } +// 读取 JSON 元数据文件中的 DOI async function getDoiFromMetadata(pdfPath) { try { - // Get the corresponding JSON file path by replacing .pdf with .json const jsonPath = pdfPath.replace('.pdf', '.json'); - - console.log(`Looking for metadata file: ${jsonPath}`); - - // Read and parse the JSON file + console.log(`🔍 Looking for metadata file: ${jsonPath}`); const jsonData = await fs.readFile(jsonPath, 'utf8'); const metadata = JSON.parse(jsonData); - - if (!metadata.doi) { - throw new Error(`No DOI found in metadata 
file: ${jsonPath}`); - } - - console.log(`Found DOI: ${metadata.doi}`); + if (!metadata.doi) throw new Error(`No DOI found in metadata file: ${jsonPath}`); + console.log(`✅ Found DOI: ${metadata.doi}`); return metadata.doi; } catch (error) { - console.error(`Error getting DOI from metadata:`, error); + console.error(`❌ Error getting DOI from metadata:`, error); throw error; } } -const sliceAndUploadPdf = async (inputPath, doi) => { +// 上传单个 PDF(不再切片) +const uploadPdf = async (inputPath, doi) => { try { console.log(`\n📄 Processing PDF: ${path.basename(inputPath)}`); - - // Read and validate PDF - const pdfBytes = await fs.readFile(inputPath); - const pdfDoc = await PDFDocument.load(pdfBytes); - const fileBase64 = await pdfDoc.saveAsBase64(); - - // Create chunks - const chunks = []; - for (let i = 0; i < fileBase64.length; i += MAX_SLICE_SIZE) { - const chunk = fileBase64.slice(i, i + MAX_SLICE_SIZE); - chunks.push(chunk); - } - - console.log(`File size: ${fileBase64.length} bytes`); - console.log(`Total chunks: ${chunks.length}`); - // Check if PDF was already uploaded + // 1. 检查是否已上传过 const query = ` query { transactions( tags: [ + { name: "App-Name", values: ["scivault"] }, { name: "Content-Type", values: ["application/pdf"] }, - { name: "application", values: ["scivault"] }, - { name: "Version", values: ["1.0.3"] }, - { name: "Type", values: ["pdf-index"] }, - { name: "Collection", values: ["${doi}"] } + { name: "Version", values: ["2.0.0"] }, + { name: "doi", values: ["${doi}"] } ] ) { edges { - node { - id - } + node { id } } } } @@ -100,56 +75,42 @@ const sliceAndUploadPdf = async (inputPath, doi) => { const result = await response.json(); if (result.data?.transactions?.edges?.[0]?.node?.id) { - console.log(`⚠️ PDF already exists for DOI: ${doi}`); + console.log(`⚠️ PDF already uploaded for DOI: ${doi}`); return result.data.transactions.edges.map(edge => edge.node.id); } - // Upload chunks + // 2. 
上传 PDF const irys = await getIrysUploader(); - if (!irys) { - throw new Error("Failed to initialize Irys uploader"); - } + if (!irys) throw new Error("Failed to initialize Irys uploader"); - const receiptIDs = []; + const buffer = await fs.readFile(inputPath); const tags = [ + { name: "App-Name", value: "scivault" }, { name: "Content-Type", value: "application/pdf" }, - { name: "application", value: "scivault" }, - { name: "Version", value: "1.0.3" }, - { name: "Type", value: "pdf-index" }, - { name: "Collection", value: doi } + { name: "Version", value: "2.0.0" }, + { name: "doi", value: doi } ]; - for (let i = 0; i < chunks.length; i++) { - console.log(`\nUploading chunk ${i + 1}/${chunks.length}...`); - const receipt = await irys.upload(Buffer.from(chunks[i]), { tags }); - receiptIDs.push(receipt.id); - console.log(`✅ Chunk uploaded: ${receipt.id}`); - } - - console.log(`\n✨ PDF uploaded successfully!`); - console.log(`Receipt IDs: ${receiptIDs.join(", ")}`); - return receiptIDs; + const receipt = await irys.upload(buffer, { tags }); + console.log(`✅ PDF uploaded successfully. 
Transaction ID: ${receipt.id}`); + return [receipt.id]; } catch (error) { - console.error(`❌ Error processing PDF: ${error.message}`); + console.error(`❌ Error uploading PDF: ${error.message}`); throw error; } }; -// 添加错误日志功能 +// 错误记录 async function logError(filePath, error, doi = null) { const errorLogPath = path.join(process.cwd(), 'upload_errors.json'); try { - // 读取现有的错误日志,如果不存在则创建新的 let errorLog = []; try { const existingLog = await fs.readFile(errorLogPath, 'utf8'); errorLog = JSON.parse(existingLog); - } catch (e) { - // 文件不存在,使用空数组 - } + } catch {} - // 添加新的错误记录 errorLog.push({ timestamp: new Date().toISOString(), file: filePath, @@ -158,18 +119,18 @@ async function logError(filePath, error, doi = null) { stack: error.stack }); - // 保存更新后的错误日志 await fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2)); - console.log(`Error logged to ${errorLogPath}`); + console.log(`📝 Error logged to ${errorLogPath}`); } catch (logError) { - console.error('Failed to log error:', logError); + console.error('❌ Failed to log error:', logError); } } +// 批量上传 PDF 主函数 const uploadPdfs = async (pdfDir) => { try { const files = await walkDir(pdfDir); - console.log(`Found ${files.length} PDF files in ${pdfDir}`); + console.log(`\n📁 Found ${files.length} PDF files in ${pdfDir}`); let successCount = 0; let failCount = 0; @@ -179,34 +140,24 @@ const uploadPdfs = async (pdfDir) => { const pdfFile = files[i]; let doi = null; try { - // 获取 DOI doi = await getDoiFromMetadata(pdfFile); - console.log(`\nProcessing PDF: ${path.basename(pdfFile)}`); - console.log(`Using DOI: ${doi}`); - - // 尝试上传 - await sliceAndUploadPdf(pdfFile, doi); + await uploadPdf(pdfFile, doi); successCount++; } catch (error) { failCount++; await logError(pdfFile, error, doi); - errorFiles.push({ - file: pdfFile, - doi: doi, - error: error.message - }); + errorFiles.push({ file: pdfFile, doi: doi, error: error.message }); } - // Progress report if ((i + 1) % 5 === 0 || i === files.length - 1) { 
console.log(`\n📊 Progress Report:`); - console.log(` Success: ${successCount}`); - console.log(` Failed: ${failCount}`); - console.log(` Progress: ${Math.round((i + 1) / files.length * 100)}%`); + console.log(` ✅ Success: ${successCount}`); + console.log(` ❌ Failed: ${failCount}`); + console.log(` 🔄 Progress: ${Math.round((i + 1) / files.length * 100)}%`); } } - // 在完成时生成详细报告 + // 写入报告 const report = { timestamp: new Date().toISOString(), totalFiles: files.length, @@ -216,17 +167,15 @@ const uploadPdfs = async (pdfDir) => { failedFiles: errorFiles }; - // 保存报告 const reportPath = path.join(process.cwd(), 'upload_report.json'); await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); console.log(`\n🎉 Upload Complete`); - console.log(` Total Success: ${successCount}`); - console.log(` Total Failed: ${failCount}`); - console.log(` Success Rate: ${Math.round(successCount / files.length * 100)}%`); - console.log(` Detailed report saved to: ${reportPath}`); + console.log(` ✅ Total Success: ${successCount}`); + console.log(` ❌ Total Failed: ${failCount}`); + console.log(` 📄 Report saved to: ${reportPath}`); if (failCount > 0) { - console.log(` Error log saved to: upload_errors.json`); + console.log(` 📌 Error log saved to: upload_errors.json`); } } catch (error) { @@ -235,14 +184,15 @@ const uploadPdfs = async (pdfDir) => { } }; -// If running directly +// CLI 执行入口 if (require.main === module) { const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata'); uploadPdfs(metadataDir).catch(console.error); } +// 可导出函数供其他模块调用 module.exports = { getIrysUploader, - sliceAndUploadPdf, + uploadPdf, uploadPdfs }; diff --git a/fund.js b/fund.js new file mode 100644 index 0000000..2efad66 --- /dev/null +++ b/fund.js @@ -0,0 +1,51 @@ +require("dotenv").config(); +const readline = require("readline"); +const { Uploader } = require("@irys/upload"); +const { Solana } = require("@irys/upload-solana"); + +const rl = readline.createInterface({ + input: process.stdin, + 
output: process.stdout +}); + +const askUser = (question) => { + return new Promise((resolve) => { + rl.question(question, (answer) => { + resolve(answer.trim().toLowerCase()); + }); + }); +}; + +const main = async () => { + try { + const irys = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); + + const address = irys.address; + const token = irys.token; + + const atomicBalance = await irys.getLoadedBalance(); + const balance = irys.utils.fromAtomic(atomicBalance); + + console.log(`\n🌐 Public Address: ${address}`); + console.log(`💰 Current Irys Balance: ${balance} ${token}`); + console.log(`🔗 Check wallet on Solana Explorer: https://explorer.solana.com/address/${address}?cluster=mainnet`); + + const answer = await askUser("\n🪙 Do you want to fund 0.01 SOL to Irys? (yes/no): "); + + if (answer === "yes" || answer === "y") { + const amount = "0.01"; + console.log(`\n⛽ Funding ${amount} SOL to Irys...`); + + const fundResult = await irys.fund(irys.utils.toAtomic(amount)); + console.log(`✅ Fund successful! 
Transaction ID: ${fundResult.id}`); + } else { + console.log("ℹ️ Funding skipped."); + } + } catch (err) { + console.error("❌ Failed to get balance or fund Irys:", err); + } finally { + rl.close(); + } +}; + +main(); diff --git a/package.json b/package.json index 06f3ca5..a2fb714 100644 --- a/package.json +++ b/package.json @@ -28,5 +28,14 @@ "decentralized" ], "author": "SciVault", - "license": "MIT" -} \ No newline at end of file + "license": "MIT", + "devDependencies": {}, + "repository": { + "type": "git", + "url": "git+https://github.com/Scihub-Community/sciuploader.git" + }, + "bugs": { + "url": "https://github.com/Scihub-Community/sciuploader/issues" + }, + "homepage": "https://github.com/Scihub-Community/sciuploader#readme" +} From d948daecfbcba44507b94e3cc4b670c24706618e Mon Sep 17 00:00:00 2001 From: heinzhex Date: Thu, 29 May 2025 02:46:40 +0900 Subject: [PATCH 2/8] docs: update README with setup and usage instructions --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c626acd..dc773cd 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ A decentralized academic paper repository system built on Arweave/Irys. 1. Clone this repository: ```bash - git clone https://github.com/yourusername/scivault.git + git clone https://github.com/SciVault/sciuploader cd sciuploader ``` From b326bfbcd0b8a5f5b390604d9dc2de92c0690a81 Mon Sep 17 00:00:00 2001 From: heinzhex Date: Thu, 29 May 2025 03:00:22 +0900 Subject: [PATCH 3/8] docs: update README with setup and usage instructions --- README.md | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/README.md b/README.md index dc773cd..1715ff0 100644 --- a/README.md +++ b/README.md @@ -54,17 +54,11 @@ node 2_upload_pdf.js Note: If uploads fail due to network issues, you can safely run the script again. It will skip already uploaded files and continue with failed ones. 
-### Step 3: Upload Complete Metadata - -Upload the complete metadata with all paper details: -```bash -node 3_upload_all_metadata.js -``` ## Version Control The system uses semantic versioning for content management: -- Current version: `1.0.3` +- Current version: `2.0.0` - Format: `MAJOR.MINOR.PATCH` - MAJOR: Breaking changes - MINOR: New features From 5cdb7f24aa19c84dcbe7307500e7353b289e047e Mon Sep 17 00:00:00 2001 From: heinzhex Date: Tue, 3 Jun 2025 19:36:19 +0900 Subject: [PATCH 4/8] Refactor: restructure script flow and replace old upload logic --- 0_generate_basic_metadata.js | 100 ----------------- 0_run_workflow.js | 57 ++++++++++ 1_fetch_all_dois.js | 88 +++++++++++++++ 1_upload_basic_metadata.js | 101 ----------------- 2_fetch_all_pdfs.js | 165 +++++++++++++++++++++++++++ 2_upload_pdf.js | 198 -------------------------------- 3_generate_basic_metadata.js | 99 ++++++++++++++++ 3_upload_all_metadata.js | 200 --------------------------------- 4_upload_all_basic_metadata.js | 131 +++++++++++++++++++++ 5_upload_all_pdfs.js | 168 +++++++++++++++++++++++++++ fund.js => 9_fund.js | 0 package.json | 7 +- 12 files changed, 713 insertions(+), 601 deletions(-) delete mode 100644 0_generate_basic_metadata.js create mode 100644 0_run_workflow.js create mode 100644 1_fetch_all_dois.js delete mode 100644 1_upload_basic_metadata.js create mode 100644 2_fetch_all_pdfs.js delete mode 100644 2_upload_pdf.js create mode 100644 3_generate_basic_metadata.js delete mode 100644 3_upload_all_metadata.js create mode 100644 4_upload_all_basic_metadata.js create mode 100644 5_upload_all_pdfs.js rename fund.js => 9_fund.js (100%) diff --git a/0_generate_basic_metadata.js b/0_generate_basic_metadata.js deleted file mode 100644 index 9e4804f..0000000 --- a/0_generate_basic_metadata.js +++ /dev/null @@ -1,100 +0,0 @@ -const fs = require('fs').promises; -const path = require('path'); - -async function walkDir(dir) { - try { - const files = await fs.readdir(dir); - const jsonFiles = 
files.filter(file => file.endsWith('.json')); - return jsonFiles.map(file => path.join(dir, file)); - } catch (error) { - console.error('Error reading directory:', error); - throw error; - } -} - -function extractAbstract(paper) { - // Try to reconstruct abstract from inverted index if available - if (paper.openalex?.abstract_inverted_index) { - const words = []; - const index = paper.openalex.abstract_inverted_index; - const maxPosition = Math.max(...Object.values(index).flat()); - - for (let i = 0; i <= maxPosition; i++) { - for (const [word, positions] of Object.entries(index)) { - if (positions.includes(i)) { - words[i] = word; - break; - } - } - } - return words.join(' '); - } - return ""; // Return empty string if no abstract found -} - -function extractBasicMetadata(paper) { - return { - abstract: extractAbstract(paper), - title: paper.openalex?.title || - paper.crossref?.title?.[0] || - "", - authors: paper.openalex?.authorships - ?.map(a => a.raw_author_name) - .join(", ") || - paper.crossref?.author - ?.map(a => `${a.given} ${a.family}`) - .join(", ") || - "", - doi: paper.doi || "", - aid: paper.openalex?.id?.split("/").pop() || - paper.crossref?.DOI?.replace(/[^a-zA-Z0-9]/g, "") || - "" - }; -} - -async function generateBasicMetadata(metadataDir) { - try { - // Get all JSON files in the directory - const files = await walkDir(metadataDir); - - // Process each file - const metadata = []; - for (const file of files) { - try { - console.log(`Processing file: ${file}`); // Add logging - const content = await fs.readFile(file, 'utf8'); - const paper = JSON.parse(content.trim()); // Add trim() to remove any BOM or whitespace - - const basicMetadata = extractBasicMetadata(paper); - metadata.push(basicMetadata); - } catch (error) { - console.error(`Error processing file ${file}:`, error); - // Continue with next file instead of stopping - continue; - } - } - - // Write the results to a file - const outputPath = path.join(process.cwd(), 'basic_metadata.json'); - 
await fs.writeFile( - outputPath, - JSON.stringify(metadata, null, 2) - ); - - console.log(`Basic metadata generated and saved to ${outputPath}`); - console.log(`Processed ${metadata.length} files successfully`); - return metadata; - } catch (error) { - console.error('Error generating basic metadata:', error); - throw error; - } -} - -// Export the function if using as a module -module.exports = generateBasicMetadata; - -// If running directly -if (require.main === module) { - const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata'); - generateBasicMetadata(metadataDir).catch(console.error); -} diff --git a/0_run_workflow.js b/0_run_workflow.js new file mode 100644 index 0000000..5932947 --- /dev/null +++ b/0_run_workflow.js @@ -0,0 +1,57 @@ +// 0_run_workflow.js +const { execSync } = require("child_process"); + +// Get CLI arguments +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find((arg) => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; + +const startPage = getArg("start-page"); +const endPage = getArg("end-page"); + +if (!startPage || !endPage || isNaN(startPage) || isNaN(endPage)) { + console.error("❌ Missing or invalid arguments. 
Usage: node 0_run_workflow.js --start-page=3 --end-page=4"); + process.exit(1); +} + +console.log(`🚀 Starting workflow from page ${startPage} to ${endPage}\n`); + +const steps = [ + { + name: "📥 Step 1️⃣: Fetching DOI JSON...", + command: `node 1_fetch_all_dois.js --start-page=${startPage} --end-page=${endPage}`, + }, + { + name: "📄 Step 2️⃣: Downloading PDFs...", + command: `node 2_fetch_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`, + }, + { + name: "🧠 Step 3️⃣: Generating metadata...", + command: `node 3_generate_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`, + }, + { + name: "🆙 Step 4️⃣: Uploading metadata to Irys...", + command: `node 4_upload_all_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`, + }, + { + name: "📤 Step 5️⃣: Uploading PDFs to Irys...", + command: `node 5_upload_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`, + }, +]; + +(async () => { + for (const step of steps) { + console.log(`\n${step.name}`); + try { + execSync(step.command, { stdio: "inherit" }); + } catch (err) { + console.error(`❌ Workflow failed: ${err.message}`); + process.exit(1); + } + } + + console.log("\n✅ All steps completed successfully!"); +})(); diff --git a/1_fetch_all_dois.js b/1_fetch_all_dois.js new file mode 100644 index 0000000..6b60a8f --- /dev/null +++ b/1_fetch_all_dois.js @@ -0,0 +1,88 @@ +const fs = require('fs'); +const axios = require('axios'); +const path = require('path'); + +const OUTPUT_DIR = path.join(__dirname, 'doi'); +const BASE_URL = 'https://api.scai.sh/dois?page='; +const TOTAL_PAGES = 883431; +const DELAY_MS = 2000; + +// Parse CLI arguments: --start-page=XX --end-page=XX +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? 
parseInt(found.slice(prefix.length), 10) : undefined; +}; + +const cliStartPage = getArg('start-page'); +const cliEndPage = getArg('end-page'); + +// Ensure output directory exists +if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR); +} + +// Utility: Delay between requests +const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); + +// Get last downloaded page number from existing files +function getLastDownloadedPage() { + const files = fs.readdirSync(OUTPUT_DIR); + const pageNumbers = files + .map(file => { + const match = file.match(/page_(\d+)\.json$/); + return match ? parseInt(match[1], 10) : null; + }) + .filter(n => n !== null) + .sort((a, b) => a - b); + return pageNumbers.length ? pageNumbers[pageNumbers.length - 1] : 0; +} + +// Download a range of pages +async function downloadAllPages(startPage, endPage) { + for (let page = startPage; page <= endPage; page++) { + const filePath = path.join(OUTPUT_DIR, `page_${page}.json`); + if (fs.existsSync(filePath)) { + console.log(`✅ Page ${page} already exists. Skipping.`); + continue; + } + + const url = `${BASE_URL}${page}`; + try { + console.log(`🔍 Fetching page ${page}...`); + const res = await axios.get(url); + const data = res.data; + + if (data && Array.isArray(data.dois)) { + fs.writeFileSync(filePath, JSON.stringify(data.dois, null, 2)); + console.log(`✅ Page ${page} saved (${data.dois.length} DOIs)`); + } else { + console.warn(`⚠️ Page ${page} response missing 'dois' array. Skipping.`); + } + } catch (err) { + console.error(`❌ Failed to fetch page ${page}: ${err.message}`); + console.log('🛑 Stopping script. 
You can rerun it to resume.'); + break; + } + + await sleep(DELAY_MS); + } + + console.log('🎉 Finished fetching pages.'); +} + +// Entry point +async function main() { + if (cliStartPage !== undefined && cliEndPage !== undefined) { + console.log(`🚀 Running in range mode: page ${cliStartPage} → ${cliEndPage}`); + await downloadAllPages(cliStartPage, cliEndPage); + } else { + const start = getLastDownloadedPage() + 1; + const end = TOTAL_PAGES; + console.log(`🔁 Resuming from page ${start} → ${end}`); + await downloadAllPages(start, end); + } +} + +main(); diff --git a/1_upload_basic_metadata.js b/1_upload_basic_metadata.js deleted file mode 100644 index d0c70bd..0000000 --- a/1_upload_basic_metadata.js +++ /dev/null @@ -1,101 +0,0 @@ -require("dotenv").config(); -const { Uploader } = require("@irys/upload"); -const { Solana } = require("@irys/upload-solana"); -const fs = require("fs").promises; -const path = require("path"); - -// 初始化上传器 -const getIrysUploader = async () => { - try { - const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("✅ Irys uploader initialized."); - return irysUploader; - } catch (error) { - console.error("❌ Failed to initialize Irys uploader:", error); - return null; - } -}; - -// 上传 basic_metadata.json 中的数据 -const uploadBasicMetadata = async () => { - const irys = await getIrysUploader(); - if (!irys) { - console.error("Irys uploader could not be initialized."); - return; - } - - try { - const filePath = path.join(process.cwd(), 'basic_metadata.json'); - console.log(`📄 Reading file: ${filePath}`); - - const content = await fs.readFile(filePath, 'utf8'); - const papers = JSON.parse(content); - - console.log(`📚 Loaded ${papers.length} papers for processing`); - - let successCount = 0; - let failCount = 0; - - for (let i = 0; i < papers.length; i++) { - const paper = papers[i]; - console.log(`\n📄 Processing paper [${i + 1}/${papers.length}]`); - - if (!paper.doi) { - console.log(`⚠️ Skipping paper: No 
DOI found`); - failCount++; - continue; - } - - try { - const normalizedDoi = paper.doi.trim(); - const normalizedTitle = (paper.title || "") - .replace(/\s+/g, ' ') - .replace(/\n/g, '') - .trim(); - - const normalizedAuthors = (paper.authors || "") - .replace(/\s+/g, ' ') - .replace(/\n/g, '') - .trim(); - - const tags = [ - { name: "App-Name", value: "scivault" }, - { name: "Content-Type", value: "application/json" }, - { name: "Version", value: "2.0.0" }, - { name: "doi", value: normalizedDoi }, - { name: "title", value: normalizedTitle }, - { name: "authors", value: normalizedAuthors }, - { name: "aid", value: paper.aid || "" } - ]; - - const paperMetadata = Buffer.from(JSON.stringify(paper)); - const receipt = await irys.upload(paperMetadata, { tags }); - - console.log(`✅ Uploaded: ${normalizedDoi} (${receipt.id})`); - successCount++; - - } catch (error) { - console.error(`❌ Failed: ${paper.doi} - ${error.message}`); - failCount++; - } - - if ((i + 1) % 10 === 0 || i === papers.length - 1) { - console.log(`\n📊 Progress Report:`); - console.log(` ✅ Success: ${successCount}`); - console.log(` ❌ Failed: ${failCount}`); - console.log(` 🔄 Progress: ${Math.round((i + 1) / papers.length * 100)}%`); - } - } - - console.log(`\n✨ Upload Complete`); - console.log(` ✅ Total Success: ${successCount}`); - console.log(` ❌ Total Failed: ${failCount}`); - console.log(` 📈 Success Rate: ${Math.round(successCount / papers.length * 100)}%`); - - } catch (error) { - console.error("❌ Error uploading metadata:", error); - } -}; - -// 执行上传 -uploadBasicMetadata().catch(console.error); diff --git a/2_fetch_all_pdfs.js b/2_fetch_all_pdfs.js new file mode 100644 index 0000000..d05bd18 --- /dev/null +++ b/2_fetch_all_pdfs.js @@ -0,0 +1,165 @@ +const fs = require('fs'); +const path = require('path'); +const https = require('https'); +const axios = require('axios'); + +// === Configuration === +const DOI_DIR = './doi'; +const PDF_DIR = './pdf'; +const SCI_HUB_MIRRORS = [ + 
'https://sci-hub.st/', + 'https://sci-hub.se/', + 'https://sci-hub.ru/', + 'https://www.tesble.com/', +]; +const DELAY_MS = 3000; +const MIN_VALID_SIZE = 1024; + +// === CLI Argument Parser === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Utility Functions === +const sleep = ms => new Promise(resolve => setTimeout(resolve, ms)); +function ensureDir(dirPath) { + if (!fs.existsSync(dirPath)) fs.mkdirSync(dirPath, { recursive: true }); +} + +async function downloadPdfFromUrl(url, filePath) { + try { + const writer = fs.createWriteStream(filePath); + const response = await axios({ + url, + method: 'GET', + responseType: 'stream', + httpsAgent: new https.Agent({ rejectUnauthorized: false }) + }); + + response.data.pipe(writer); + return new Promise((resolve, reject) => { + writer.on('finish', () => { + const stats = fs.statSync(filePath); + if (stats.size >= MIN_VALID_SIZE) { + console.log(`✅ Downloaded: ${url}`); + resolve(true); + } else { + fs.unlinkSync(filePath); + console.warn(`❌ Download too small: ${url}`); + resolve(false); + } + }); + writer.on('error', reject); + }); + } catch (err) { + console.error(`❌ Download failed: ${url}`, err.message); + return false; + } +} + +async function extractPdfLinkAndDownload(doi, mirror, outputPath) { + try { + const url = mirror + encodeURIComponent(doi); + const response = await axios.get(url, { httpsAgent: new https.Agent({ rejectUnauthorized: false }) }); + const html = response.data; + + const embedMatch = html.match(/]*src=["']([^"']+\.pdf[^"']*)["']/i); + if (!embedMatch || !embedMatch[1]) { + console.warn(`❌ No PDF embed found for ${doi}`); + return false; + } + + let pdfUrl = embedMatch[1]; + if (pdfUrl.startsWith('//')) { + pdfUrl = 'https:' + pdfUrl; 
+ } else if (!pdfUrl.startsWith('http')) { + pdfUrl = mirror + (pdfUrl.startsWith('/') ? pdfUrl.slice(1) : pdfUrl); + } + + return await downloadPdfFromUrl(pdfUrl, outputPath); + } catch (err) { + console.warn(`❌ Error scraping ${mirror} for ${doi}: ${err.message}`); + return false; + } +} + +async function tryAllMirrors(doi, outputPath) { + for (const mirror of SCI_HUB_MIRRORS) { + const success = await extractPdfLinkAndDownload(doi, mirror, outputPath); + if (success) return true; + await sleep(1000); + } + return false; +} + +async function processPage(pageFile) { + const pageNum = pageFile.match(/\d+/)[0]; + const doiPath = path.join(DOI_DIR, pageFile); + const outDir = path.join(PDF_DIR, `page_${pageNum}`); + ensureDir(outDir); + + const failedLogPath = path.join(outDir, `failed_log_page_${pageNum}.txt`); + let failedDois = new Set(); + if (fs.existsSync(failedLogPath)) { + failedDois = new Set(fs.readFileSync(failedLogPath, 'utf8').split('\n').filter(Boolean)); + } + + const dois = JSON.parse(fs.readFileSync(doiPath, 'utf8')); + + for (const doi of dois) { + const doiSafe = encodeURIComponent(doi); + const pdfPath = path.join(outDir, `${doiSafe}.pdf`); + + if (fs.existsSync(pdfPath)) { + const stats = fs.statSync(pdfPath); + if (stats.size >= MIN_VALID_SIZE) { + console.log(`✅ Already exists: ${pdfPath}`); + continue; + } else { + console.warn(`⚠️ Removing invalid file: ${pdfPath}`); + fs.unlinkSync(pdfPath); + } + } + + if (failedDois.has(doi)) { + console.log(`⚠️ Previously failed: ${doi}, skipping`); + continue; + } + + console.log(`📄 Downloading DOI: ${doi}`); + const success = await tryAllMirrors(doi, pdfPath); + if (!success) { + fs.appendFileSync(failedLogPath, `${doi}\n`); + console.error(`❌ Failed to download ${doi}`); + } + + await sleep(DELAY_MS); + } +} + +async function main() { + ensureDir(PDF_DIR); + + const pageFiles = fs.readdirSync(DOI_DIR) + .filter(f => f.startsWith('page_') && f.endsWith('.json')) + .sort((a, b) => 
parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])); + + const filtered = pageFiles.filter(f => { + const page = parseInt(f.match(/\d+/)[0], 10); + return (!cliStart || page >= cliStart) && (!cliEnd || page <= cliEnd); + }); + + for (const file of filtered) { + console.log(`\n=== Processing ${file} ===`); + await processPage(file); + } + + console.log('\n🎉 All requested PDF downloads finished.'); +} + +main(); diff --git a/2_upload_pdf.js b/2_upload_pdf.js deleted file mode 100644 index e6de4bf..0000000 --- a/2_upload_pdf.js +++ /dev/null @@ -1,198 +0,0 @@ -require("dotenv").config(); -const { Uploader } = require("@irys/upload"); -const { Solana } = require("@irys/upload-solana"); -const fs = require("fs").promises; -const path = require("path"); - -// 初始化 Irys 上传器 -const getIrysUploader = async () => { - try { - const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("✅ Irys uploader initialized."); - return irysUploader; - } catch (error) { - console.error("❌ Failed to initialize Irys uploader:", error); - return null; - } -}; - -// 遍历目录查找 PDF 文件 -async function walkDir(dir) { - try { - const files = await fs.readdir(dir); - const pdfFiles = files.filter(file => file.toLowerCase().endsWith('.pdf')); - return pdfFiles.map(file => path.join(dir, file)); - } catch (error) { - console.error('Error reading directory:', error); - throw error; - } -} - -// 读取 JSON 元数据文件中的 DOI -async function getDoiFromMetadata(pdfPath) { - try { - const jsonPath = pdfPath.replace('.pdf', '.json'); - console.log(`🔍 Looking for metadata file: ${jsonPath}`); - const jsonData = await fs.readFile(jsonPath, 'utf8'); - const metadata = JSON.parse(jsonData); - if (!metadata.doi) throw new Error(`No DOI found in metadata file: ${jsonPath}`); - console.log(`✅ Found DOI: ${metadata.doi}`); - return metadata.doi; - } catch (error) { - console.error(`❌ Error getting DOI from metadata:`, error); - throw error; - } -} - -// 上传单个 PDF(不再切片) -const uploadPdf 
= async (inputPath, doi) => { - try { - console.log(`\n📄 Processing PDF: ${path.basename(inputPath)}`); - - // 1. 检查是否已上传过 - const query = ` - query { - transactions( - tags: [ - { name: "App-Name", values: ["scivault"] }, - { name: "Content-Type", values: ["application/pdf"] }, - { name: "Version", values: ["2.0.0"] }, - { name: "doi", values: ["${doi}"] } - ] - ) { - edges { - node { id } - } - } - } - `; - - const response = await fetch("https://uploader.irys.xyz/graphql", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query }) - }); - - const result = await response.json(); - if (result.data?.transactions?.edges?.[0]?.node?.id) { - console.log(`⚠️ PDF already uploaded for DOI: ${doi}`); - return result.data.transactions.edges.map(edge => edge.node.id); - } - - // 2. 上传 PDF - const irys = await getIrysUploader(); - if (!irys) throw new Error("Failed to initialize Irys uploader"); - - const buffer = await fs.readFile(inputPath); - const tags = [ - { name: "App-Name", value: "scivault" }, - { name: "Content-Type", value: "application/pdf" }, - { name: "Version", value: "2.0.0" }, - { name: "doi", value: doi } - ]; - - const receipt = await irys.upload(buffer, { tags }); - console.log(`✅ PDF uploaded successfully. 
Transaction ID: ${receipt.id}`); - return [receipt.id]; - - } catch (error) { - console.error(`❌ Error uploading PDF: ${error.message}`); - throw error; - } -}; - -// 错误记录 -async function logError(filePath, error, doi = null) { - const errorLogPath = path.join(process.cwd(), 'upload_errors.json'); - try { - let errorLog = []; - try { - const existingLog = await fs.readFile(errorLogPath, 'utf8'); - errorLog = JSON.parse(existingLog); - } catch {} - - errorLog.push({ - timestamp: new Date().toISOString(), - file: filePath, - doi: doi, - error: error.message || String(error), - stack: error.stack - }); - - await fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2)); - console.log(`📝 Error logged to ${errorLogPath}`); - } catch (logError) { - console.error('❌ Failed to log error:', logError); - } -} - -// 批量上传 PDF 主函数 -const uploadPdfs = async (pdfDir) => { - try { - const files = await walkDir(pdfDir); - console.log(`\n📁 Found ${files.length} PDF files in ${pdfDir}`); - - let successCount = 0; - let failCount = 0; - let errorFiles = []; - - for (let i = 0; i < files.length; i++) { - const pdfFile = files[i]; - let doi = null; - try { - doi = await getDoiFromMetadata(pdfFile); - await uploadPdf(pdfFile, doi); - successCount++; - } catch (error) { - failCount++; - await logError(pdfFile, error, doi); - errorFiles.push({ file: pdfFile, doi: doi, error: error.message }); - } - - if ((i + 1) % 5 === 0 || i === files.length - 1) { - console.log(`\n📊 Progress Report:`); - console.log(` ✅ Success: ${successCount}`); - console.log(` ❌ Failed: ${failCount}`); - console.log(` 🔄 Progress: ${Math.round((i + 1) / files.length * 100)}%`); - } - } - - // 写入报告 - const report = { - timestamp: new Date().toISOString(), - totalFiles: files.length, - successCount, - failCount, - successRate: `${Math.round(successCount / files.length * 100)}%`, - failedFiles: errorFiles - }; - - const reportPath = path.join(process.cwd(), 'upload_report.json'); - await fs.writeFile(reportPath, 
JSON.stringify(report, null, 2)); - - console.log(`\n🎉 Upload Complete`); - console.log(` ✅ Total Success: ${successCount}`); - console.log(` ❌ Total Failed: ${failCount}`); - console.log(` 📄 Report saved to: ${reportPath}`); - if (failCount > 0) { - console.log(` 📌 Error log saved to: upload_errors.json`); - } - - } catch (error) { - console.error("❌ Error in upload process:", error); - await logError('global', error); - } -}; - -// CLI 执行入口 -if (require.main === module) { - const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata'); - uploadPdfs(metadataDir).catch(console.error); -} - -// 可导出函数供其他模块调用 -module.exports = { - getIrysUploader, - uploadPdf, - uploadPdfs -}; diff --git a/3_generate_basic_metadata.js b/3_generate_basic_metadata.js new file mode 100644 index 0000000..26335f5 --- /dev/null +++ b/3_generate_basic_metadata.js @@ -0,0 +1,99 @@ +const fs = require('fs'); +const path = require('path'); +const axios = require('axios'); + +// === Configuration === +const PDF_BASE_DIR = './pdf'; +const OPENALEX_BASE_URL = 'https://api.openalex.org/works/doi:'; +const DELAY_MS = 1500; + +// === CLI Argument Parser === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? 
parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Utilities === +const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms)); + +// Convert inverted index to plain abstract text +const parseAbstract = (index) => { + if (!index || typeof index !== 'object') return ''; + const words = []; + for (const [word, positions] of Object.entries(index)) { + positions.forEach(pos => { + words[pos] = word; + }); + } + return words.join(' '); +}; + +// Extract only essential metadata fields +const extractMetadata = (data) => { + const title = data.title || data.display_name || ''; + const authors = (data.authorships || []) + .map(a => a.author?.display_name) + .filter(Boolean) + .join(', '); + const abstract = parseAbstract(data.abstract_inverted_index); + const doi = data.doi?.replace('https://doi.org/', '') || ''; + const aid = data.id?.replace('https://openalex.org/', '') || ''; + return { title, authors, abstract, doi, aid }; +}; + +// Process all PDFs in a single page folder +async function generateMetadataForPage(pageDir) { + const pageNum = pageDir.match(/\d+/)[0]; + console.log(`\n📁 Processing folder: page_${pageNum}`); + + const pdfFiles = fs.readdirSync(pageDir).filter(f => f.endsWith('.pdf')); + const metadataList = []; + + for (const file of pdfFiles) { + const doiEncoded = file.replace(/\.pdf$/, ''); + const doi = decodeURIComponent(doiEncoded); + const openalexUrl = `${OPENALEX_BASE_URL}${doi}`; + + try { + console.log(`🔍 Fetching metadata for DOI: ${doi}`); + const response = await axios.get(openalexUrl); + const metadata = extractMetadata(response.data); + metadataList.push(metadata); + } catch (error) { + console.warn(`⚠️ Failed to fetch metadata for ${doi}: ${error.message}`); + } + + await sleep(DELAY_MS); + } + + const outputPath = path.join(pageDir, 'basic_metadata.json'); + fs.writeFileSync(outputPath, JSON.stringify(metadataList, null, 2)); + 
console.log(`✅ Saved metadata to ${outputPath}`); +} + +// === Main Function === +async function main() { + const subdirs = fs.readdirSync(PDF_BASE_DIR) + .filter(d => d.startsWith('page_')) + .sort((a, b) => parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])) + .filter(d => { + const page = parseInt(d.match(/\d+/)[0], 10); + if (cliStart && page < cliStart) return false; + if (cliEnd && page > cliEnd) return false; + return true; + }) + .map(d => path.join(PDF_BASE_DIR, d)) + .filter(d => fs.statSync(d).isDirectory()); + + for (const pageDir of subdirs) { + await generateMetadataForPage(pageDir); + } + + console.log('\n🎉 Metadata generation completed for all selected folders.'); +} + +main(); diff --git a/3_upload_all_metadata.js b/3_upload_all_metadata.js deleted file mode 100644 index 2947955..0000000 --- a/3_upload_all_metadata.js +++ /dev/null @@ -1,200 +0,0 @@ -require("dotenv").config(); -const { Uploader } = require("@irys/upload"); -const { Solana } = require("@irys/upload-solana"); -const fs = require("fs").promises; -const path = require("path"); - -const getIrysUploader = async () => { - try { - const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); - console.log("Irys uploader initialized."); - return irysUploader; - } catch (error) { - console.error("Failed to initialize Irys uploader:", error); - return null; - } -}; - -async function walkDir(dir) { - try { - const files = await fs.readdir(dir); - const jsonFiles = files.filter(file => file.toLowerCase().endsWith('.json')); - return jsonFiles.map(file => path.join(dir, file)); - } catch (error) { - console.error('Error reading directory:', error); - throw error; - } -} - -async function uploadMetadata(jsonPath) { - try { - console.log(`\n📄 Processing metadata: ${path.basename(jsonPath)}`); - - // Read and parse JSON file - const jsonData = await fs.readFile(jsonPath, 'utf8'); - const metadata = JSON.parse(jsonData); - - if (!metadata.doi) { - throw new Error(`No DOI 
found in metadata file: ${jsonPath}`); - } - - // Check if metadata was already uploaded - const query = ` - query { - transactions( - tags: [ - { name: "Content-Type", values: ["metadata/json"] }, - { name: "App-Name", values: ["scivault"] }, - { name: "Version", values: ["1.0.3"] }, - { name: "doi", values: ["${metadata.doi}"] } - ] - ) { - edges { - node { - id - } - } - } - } - `; - - const response = await fetch("https://uploader.irys.xyz/graphql", { - method: "POST", - headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query }) - }); - - const result = await response.json(); - if (result.data?.transactions?.edges?.[0]?.node?.id) { - console.log(`⚠️ Metadata already exists for DOI: ${metadata.doi}`); - return result.data.transactions.edges[0].node.id; - } - - // Upload metadata - const irys = await getIrysUploader(); - if (!irys) { - throw new Error("Failed to initialize Irys uploader"); - } - - const tags = [ - { name: "Content-Type", value: "metadata/json" }, - { name: "App-Name", value: "scivault" }, - { name: "Version", value: "1.0.3" } - ]; - - for (const [key, value] of Object.entries(metadata)) { - if (value && typeof value === 'string') { - tags.push({ name: key, value: value }); - } - } - - const receipt = await irys.upload(jsonData, { tags }); - console.log(`✅ Metadata uploaded: ${receipt.id}`); - return receipt.id; - - } catch (error) { - console.error(`❌ Error processing metadata: ${error.message}`); - throw error; - } -} - -async function logError(filePath, error, doi = null) { - const errorLogPath = path.join(process.cwd(), 'metadata_upload_errors.json'); - try { - let errorLog = []; - try { - const existingLog = await fs.readFile(errorLogPath, 'utf8'); - errorLog = JSON.parse(existingLog); - } catch (e) { - // File doesn't exist, use empty array - } - - errorLog.push({ - timestamp: new Date().toISOString(), - file: filePath, - doi: doi, - error: error.message || String(error), - stack: error.stack - }); - - await 
fs.writeFile(errorLogPath, JSON.stringify(errorLog, null, 2)); - console.log(`Error logged to ${errorLogPath}`); - } catch (logError) { - console.error('Failed to log error:', logError); - } -} - -const uploadAllMetadata = async (metadataDir) => { - try { - const files = await walkDir(metadataDir); - console.log(`Found ${files.length} JSON files in ${metadataDir}`); - - let successCount = 0; - let failCount = 0; - let errorFiles = []; - - for (let i = 0; i < files.length; i++) { - const jsonFile = files[i]; - let doi = null; - try { - const jsonData = await fs.readFile(jsonFile, 'utf8'); - const metadata = JSON.parse(jsonData); - doi = metadata.doi; - - await uploadMetadata(jsonFile); - successCount++; - } catch (error) { - failCount++; - await logError(jsonFile, error, doi); - errorFiles.push({ - file: jsonFile, - doi: doi, - error: error.message - }); - } - - if ((i + 1) % 5 === 0 || i === files.length - 1) { - console.log(`\n📊 Progress Report:`); - console.log(` Success: ${successCount}`); - console.log(` Failed: ${failCount}`); - console.log(` Progress: ${Math.round((i + 1) / files.length * 100)}%`); - } - } - - const report = { - timestamp: new Date().toISOString(), - totalFiles: files.length, - successCount, - failCount, - successRate: `${Math.round(successCount / files.length * 100)}%`, - failedFiles: errorFiles - }; - - const reportPath = path.join(process.cwd(), 'metadata_upload_report.json'); - await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); - - console.log(`\n🎉 Upload Complete`); - console.log(` Total Success: ${successCount}`); - console.log(` Total Failed: ${failCount}`); - console.log(` Success Rate: ${Math.round(successCount / files.length * 100)}%`); - console.log(` Detailed report saved to: ${reportPath}`); - if (failCount > 0) { - console.log(` Error log saved to: metadata_upload_errors.json`); - } - - } catch (error) { - console.error("❌ Error in upload process:", error); - await logError('global', error); - } -}; - -if 
(require.main === module) { - const metadataDir = process.argv[2] || path.join(process.cwd(), 'metadata'); - uploadAllMetadata(metadataDir).catch(console.error); -} - -module.exports = { - getIrysUploader, - uploadMetadata, - uploadAllMetadata -}; diff --git a/4_upload_all_basic_metadata.js b/4_upload_all_basic_metadata.js new file mode 100644 index 0000000..7184ca2 --- /dev/null +++ b/4_upload_all_basic_metadata.js @@ -0,0 +1,131 @@ +require("dotenv").config(); +const { Uploader } = require("@irys/upload"); +const { Solana } = require("@irys/upload-solana"); +const fs = require("fs").promises; +const path = require("path"); + +// === Configuration === +const PDF_BASE_DIR = './pdf'; +const REPORT_FILENAME = 'upload_basic_metadata_report.txt'; + +// === CLI Argument Parser === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Initialize Irys uploader === +const getIrysUploader = async () => { + try { + const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); + console.log("✅ Irys uploader initialized."); + return irysUploader; + } catch (error) { + console.error("❌ Failed to initialize Irys uploader:", error); + return null; + } +}; + +// === Upload a single paper === +const uploadOneMetadata = async (irys, paper, pageNum, index) => { + if (!paper.doi) { + console.log(`⚠️ Skipping paper at page ${pageNum}, index ${index}: No DOI`); + return { ok: false, reason: 'no-doi' }; + } + + try { + const normalizedDoi = paper.doi.trim(); + const normalizedTitle = (paper.title || "").replace(/\s+/g, ' ').trim(); + const normalizedAuthors = (paper.authors || "").replace(/\s+/g, ' ').trim(); + + const tags = [ + { name: "App-Name", value: "scivault" }, + { name: "Content-Type", value: 
"application/json" }, + { name: "Version", value: "2.0.0" }, + { name: "doi", value: normalizedDoi }, + { name: "title", value: normalizedTitle }, + { name: "authors", value: normalizedAuthors }, + { name: "aid", value: paper.aid || "" } + ]; + + const buffer = Buffer.from(JSON.stringify(paper)); + const receipt = await irys.upload(buffer, { tags }); + + console.log(`✅ Uploaded [page_${pageNum} - ${index}]: ${normalizedDoi} (${receipt.id})`); + return { ok: true, id: receipt.id }; + } catch (err) { + console.error(`❌ Upload failed [page_${pageNum} - ${index}]: ${paper.doi} - ${err.message}`); + return { ok: false, reason: err.message }; + } +}; + +// === Process one page folder === +const uploadPageFolder = async (irys, pageDir) => { + const pageNum = pageDir.match(/\d+/)?.[0] || '?'; + const metaPath = path.join(PDF_BASE_DIR, pageDir, 'basic_metadata.json'); + const reportPath = path.join(PDF_BASE_DIR, pageDir, REPORT_FILENAME); + + try { + await fs.access(metaPath); + } catch { + console.warn(`⚠️ Skipping page_${pageNum}: no basic_metadata.json`); + return; + } + + const jsonText = await fs.readFile(metaPath, 'utf8'); + const papers = JSON.parse(jsonText); + + console.log(`\n📄 Found ${papers.length} papers in page_${pageNum}`); + const reportLines = []; + + let success = 0; + let fail = 0; + + for (let i = 0; i < papers.length; i++) { + const result = await uploadOneMetadata(irys, papers[i], pageNum, i); + const doi = papers[i].doi || '[no-doi]'; + + if (result.ok) { + success++; + reportLines.push(`✅ ${doi} : ${result.id}`); + } else { + fail++; + reportLines.push(`❌ ${doi} : ${result.reason}`); + } + + if ((i + 1) % 10 === 0 || i === papers.length - 1) { + console.log(`📊 page_${pageNum} progress: ${i + 1}/${papers.length}, ✅ ${success}, ❌ ${fail}`); + } + } + + await fs.writeFile(reportPath, reportLines.join('\n'), 'utf8'); + console.log(`📄 Upload report saved: ${reportPath}`); + console.log(`✨ Finished page_${pageNum}: ✅ ${success}, ❌ ${fail}`); +}; + +// === 
Main Execution === +(async () => { + const irys = await getIrysUploader(); + if (!irys) return; + + const dirs = await fs.readdir(PDF_BASE_DIR); + const pageDirs = dirs + .filter(d => d.startsWith('page_')) + .sort((a, b) => parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])) + .filter(d => { + const page = parseInt(d.match(/\d+/)[0], 10); + if (cliStart && page < cliStart) return false; + if (cliEnd && page > cliEnd) return false; + return true; + }); + + for (const pageDir of pageDirs) { + await uploadPageFolder(irys, pageDir); + } + + console.log('\n🎉 All basic metadata uploads completed.'); +})(); diff --git a/5_upload_all_pdfs.js b/5_upload_all_pdfs.js new file mode 100644 index 0000000..a38880c --- /dev/null +++ b/5_upload_all_pdfs.js @@ -0,0 +1,168 @@ +require("dotenv").config(); +const { Uploader } = require("@irys/upload"); +const { Solana } = require("@irys/upload-solana"); +const fs = require("fs").promises; +const path = require("path"); + +// === CONFIG === +const BASE_PDF_DIR = path.join(process.cwd(), "pdf"); +const MIN_VALID_SIZE = 1000; // in bytes +const REPORT_PREFIX = "upload_pdf_report"; + +// === CLI === +const args = process.argv.slice(2); +const getArg = (name) => { + const prefix = `--${name}=`; + const found = args.find(arg => arg.startsWith(prefix)); + return found ? 
parseInt(found.slice(prefix.length), 10) : undefined; +}; +const cliStart = getArg("start-page"); +const cliEnd = getArg("end-page"); + +// === Uploader === +const getIrysUploader = async () => { + try { + const irysUploader = await Uploader(Solana).withWallet(process.env.PRIVATE_KEY); + console.log("✅ Irys uploader initialized."); + return irysUploader; + } catch (error) { + console.error("❌ Failed to initialize Irys uploader:", error); + return null; + } +}; + +// === DOI Utilities === +function extractDoiFromFilename(filename) { + const base = path.basename(filename, ".pdf"); + return decodeURIComponent(base).replace(/%2F/g, "/").trim(); +} + +// === Check existing upload === +async function checkIfAlreadyUploaded(doi) { + const query = ` + query { + transactions( + tags: [ + { name: "App-Name", values: ["scivault"] }, + { name: "Content-Type", values: ["application/pdf"] }, + { name: "Version", values: ["2.0.0"] }, + { name: "doi", values: ["${doi}"] } + ] + ) { + edges { + node { id } + } + } + } + `; + + const response = await fetch("https://uploader.irys.xyz/graphql", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query }) + }); + + const result = await response.json(); + return result.data?.transactions?.edges?.[0]?.node?.id || null; +} + +// === Upload one PDF === +async function uploadOnePdf(irys, filePath) { + try { + const doi = extractDoiFromFilename(filePath); + if (!doi) throw new Error("Invalid DOI from filename"); + + const alreadyUploaded = await checkIfAlreadyUploaded(doi); + if (alreadyUploaded) { + console.log(`⚠️ Already uploaded: ${doi}`); + return { status: "skip", doi }; + } + + const buffer = await fs.readFile(filePath); + if (buffer.length < MIN_VALID_SIZE) { + throw new Error("File too small (<1KB)"); + } + + const tags = [ + { name: "App-Name", value: "scivault" }, + { name: "Content-Type", value: "application/pdf" }, + { name: "Version", value: "2.0.0" }, + { name: "doi", value: doi } + 
]; + + const receipt = await irys.upload(buffer, { tags }); + console.log(`✅ Uploaded ${doi} - ${receipt.id}`); + return { status: "ok", doi, id: receipt.id }; + } catch (error) { + console.error(`❌ Failed upload: ${filePath} - ${error.message}`); + return { status: "fail", file: filePath, error: error.message }; + } +} + +// === Process one page folder === +async function processPageFolder(irys, pageDir) { + const pageNum = pageDir.match(/page_(\d+)/)?.[1]; + const files = await fs.readdir(pageDir); + const pdfFiles = files.filter(f => f.endsWith(".pdf")); + + console.log(`📂 Processing page_${pageNum} - Found ${pdfFiles.length} PDFs`); + + const result = { ok: [], fail: [], skip: [] }; + + for (let i = 0; i < pdfFiles.length; i++) { + const file = pdfFiles[i]; + const filePath = path.join(pageDir, file); + const res = await uploadOnePdf(irys, filePath); + + if (res.status === "ok") result.ok.push(res); + else if (res.status === "fail") result.fail.push(res); + else if (res.status === "skip") result.skip.push(res); + + if ((i + 1) % 10 === 0 || i === pdfFiles.length - 1) { + console.log(`📊 Progress: ${i + 1}/${pdfFiles.length}`); + } + } + + // Save report + const report = { + page: `page_${pageNum}`, + timestamp: new Date().toISOString(), + total: pdfFiles.length, + success: result.ok.length, + failed: result.fail.length, + skipped: result.skip.length, + successRate: `${Math.round((result.ok.length / pdfFiles.length) * 100)}%`, + details: result + }; + + const reportPath = path.join(pageDir, `${REPORT_PREFIX}_page_${pageNum}.json`); + await fs.writeFile(reportPath, JSON.stringify(report, null, 2)); + console.log(`📝 Report saved to ${reportPath}`); +} + +// === Main === +(async () => { + const irys = await getIrysUploader(); + if (!irys) return; + + const dirs = await fs.readdir(BASE_PDF_DIR); + const pageDirs = dirs + .filter(d => d.startsWith("page_")) + .sort((a, b) => parseInt(a.match(/\d+/)[0]) - parseInt(b.match(/\d+/)[0])) + .filter(d => { + const page = 
parseInt(d.match(/\d+/)[0]); + if (cliStart && page < cliStart) return false; + if (cliEnd && page > cliEnd) return false; + return true; + }); + + for (const dir of pageDirs) { + const fullPath = path.join(BASE_PDF_DIR, dir); + const stat = await fs.lstat(fullPath); + if (stat.isDirectory()) { + await processPageFolder(irys, fullPath); + } + } + + console.log("\n🎉 All PDF uploads completed."); +})(); diff --git a/fund.js b/9_fund.js similarity index 100% rename from fund.js rename to 9_fund.js diff --git a/package.json b/package.json index a2fb714..03008c1 100644 --- a/package.json +++ b/package.json @@ -12,13 +12,17 @@ "dependencies": { "@irys/upload": "^0.0.14", "@irys/upload-solana": "^0.1.7", + "axios": "^1.9.0", "bignumber.js": "^9.1.2", "cors": "^2.8.5", "dotenv": "^16.4.7", "express": "^4.17.1", + "jsdom": "^26.1.0", + "minimist": "^1.2.8", "node-fetch": "^2.7.0", "pdf-lib": "^1.17.1", - "pdfkit": "^0.16.0" + "pdfkit": "^0.16.0", + "puppeteer": "^24.10.0" }, "keywords": [ "arweave", @@ -29,7 +33,6 @@ ], "author": "SciVault", "license": "MIT", - "devDependencies": {}, "repository": { "type": "git", "url": "git+https://github.com/Scihub-Community/sciuploader.git" From adda4ddbc9e0dd86dcd729bcbfc767420957765b Mon Sep 17 00:00:00 2001 From: heinzhex Date: Tue, 3 Jun 2025 19:47:11 +0900 Subject: [PATCH 5/8] Refactor: restructure script flow and replace old upload logic --- README.md | 127 +++++++++++++++++++++++++++++------------------------- 1 file changed, 69 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 1715ff0..2d851b9 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,97 @@ -# uploader for SciBox +# 📄 SciUploader – Bulk Sci-Hub PDF Downloader -A decentralized academic paper repository system built on Arweave/Irys. +This tool automates the batch download of academic papers from Sci-Hub using DOIs and organizes the PDFs for further metadata processing and decentralized storage (e.g., Arweave/Irys). -## Prerequisites +--- -1. 
Node.js (v16 or higher) -2. Solana wallet with SOL tokens -3. Create a `.env` file with your Solana private key: - ``` - PRIVATE_KEY=your_solana_private_key_here - ``` +## 📦 Project Structure -## Installation +``` +sciuploader/ +├── doi/ ← Each page_N.json contains a list of DOIs +├── pdf/ ← Downloaded PDFs organized by page +├── 0_run_workflow.js ← Run full workflow script +├── 1_fetch_all_dois.js ← Fetch DOI list from external source +├── 2_fetch_all_pdfs.js ← Download PDFs using DOI list +├── 3_generate_basic_metadata.js ← Generate basic metadata JSON +├── 4_upload_all_basic_metadata.js ← Upload metadata to decentralized storage (TBD) +├── 5_upload_all_pdfs.js ← Upload PDFs to decentralized storage (TBD) +├── 9_fund.js ← Funding registration or helper functions +├── .env.example ← Example environment configuration +└── README.md ← This file +``` + +--- -1. Clone this repository: - ```bash - git clone https://github.com/SciVault/sciuploader - cd sciuploader - ``` +## ✅ How to Use -2. Install dependencies: - ```bash - npm install - ``` +### 1. Install dependencies + +```bash +npm install +``` -## Usage +### 2. Set environment variables (optional) -### Step 0: Prepare Your Data +Copy `.env.example` to `.env` and fill in any required values (e.g., upload keys for later stages). -1. Create a `metadata` folder in the project root -2. Place your metadata JSON files and corresponding PDFs in this folder - - Each PDF should have a matching JSON file with the same name (e.g., `paper1.pdf` and `paper1.json`) - - JSON files must contain a `doi` field -3. Run the metadata generator: - ```bash - node 0_generate_basic_metadata.js - ``` - This will create a `basic_metadata.json` file containing essential paper information. +--- -### Step 1: Upload Basic Metadata +### 3. 
Run full workflow -Upload the basic metadata (title, authors, DOI, etc.): ```bash -node 1_upload_basic_metadata.js +node 0_run_workflow.js ``` -### Step 2: Upload PDFs +for dividing tasks, +add --start-page=3 --end-page=4 like this, there are total 883431 pages -Upload PDFs (they will be automatically split into chunks): ```bash -node 2_upload_pdf.js +node 0_run_workflow.js --start-page=3 --end-page=4 ``` -Note: If uploads fail due to network issues, you can safely run the script again. It will skip already uploaded files and continue with failed ones. +Or run step-by-step: -## Version Control +--- -The system uses semantic versioning for content management: -- Current version: `2.0.0` -- Format: `MAJOR.MINOR.PATCH` - - MAJOR: Breaking changes - - MINOR: New features - - PATCH: Bug fixes +### ◾️ Step 1: Fetch all DOIs (optional) -When uploading content, ensure you're using the correct version in the tags. +```bash +node 1_fetch_all_dois.js +``` + +This fetches DOIs from an API and saves them into `doi/page_N.json` files. -## Error Handling +--- -- Each upload script generates detailed logs: - - `upload_report.json`: Summary of upload results - - `upload_errors.json`: Details of failed uploads -- Failed uploads can be retried by running the script again -- The system checks for existing uploads to avoid duplicates +### ◾️ Step 2: Download all PDFs -## Web Interface +```bash +node 2_fetch_all_pdfs.js --start-page=1 --end-page=10 +``` -The `queryweb` folder contains a simple web interface for searching and viewing papers: -- Search by DOI, title, or arXiv ID -- View paper metadata -- Download PDF files +- Failed downloads are logged to `failed_log_page_N.txt` per page. +- Already downloaded and valid files are skipped. 
+ +--- + +### ◾️ Step 3: Generate basic metadata + +```bash +node 3_generate_basic_metadata.js +``` + +--- + +### ◾️ Step 4 & 5: Upload + +```bash +node 4_upload_all_basic_metadata.js +node 5_upload_all_pdfs.js +``` +--- -## License +## 📜 License -MIT \ No newline at end of file +MIT \ No newline at end of file From 3854483142f6c5423f7de1a5ca687470940a0cef Mon Sep 17 00:00:00 2001 From: heinzhex Date: Fri, 6 Jun 2025 17:14:15 +0900 Subject: [PATCH 6/8] script update now you can run node 0_run_workflow.js --start-page=300000 --end-page=400000 --batch-size=10 --- 0_run_workflow.js | 94 +++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 31 deletions(-) diff --git a/0_run_workflow.js b/0_run_workflow.js index 5932947..3417867 100644 --- a/0_run_workflow.js +++ b/0_run_workflow.js @@ -1,57 +1,89 @@ -// 0_run_workflow.js const { execSync } = require("child_process"); +const fs = require("fs"); +const path = require("path"); -// Get CLI arguments +// === CLI Argument Parser === const args = process.argv.slice(2); const getArg = (name) => { const prefix = `--${name}=`; const found = args.find((arg) => arg.startsWith(prefix)); return found ? parseInt(found.slice(prefix.length), 10) : undefined; }; - const startPage = getArg("start-page"); const endPage = getArg("end-page"); +const batchSize = getArg("batch-size") || 10; if (!startPage || !endPage || isNaN(startPage) || isNaN(endPage)) { - console.error("❌ Missing or invalid arguments. 
Usage: node 0_run_workflow.js --start-page=3 --end-page=4"); + console.error("❌ Usage: node 0_run_workflow.js --start-page=10 --end-page=100 --batch-size=10"); process.exit(1); } -console.log(`🚀 Starting workflow from page ${startPage} to ${endPage}\n`); - -const steps = [ - { - name: "📥 Step 1️⃣: Fetching DOI JSON...", - command: `node 1_fetch_all_dois.js --start-page=${startPage} --end-page=${endPage}`, - }, - { - name: "📄 Step 2️⃣: Downloading PDFs...", - command: `node 2_fetch_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`, - }, - { - name: "🧠 Step 3️⃣: Generating metadata...", - command: `node 3_generate_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`, - }, - { - name: "🆙 Step 4️⃣: Uploading metadata to Irys...", - command: `node 4_upload_all_basic_metadata.js --start-page=${startPage} --end-page=${endPage}`, - }, - { - name: "📤 Step 5️⃣: Uploading PDFs to Irys...", - command: `node 5_upload_all_pdfs.js --start-page=${startPage} --end-page=${endPage}`, - }, -]; +function deletePdfFolder(page) { + const dirPath = path.join("pdf", `page_${page}`); + if (fs.existsSync(dirPath)) { + const files = fs.readdirSync(dirPath); + for (const file of files) { + if (file.endsWith(".pdf")) { + fs.unlinkSync(path.join(dirPath, file)); + } + } + console.log(`🧹 Deleted PDF files in folder: ${dirPath}`); + } +} + +async function runWorkflowBatch(batchStart, batchEnd) { + console.log(`\n🚀 Starting workflow for pages ${batchStart} - ${batchEnd}\n`); + const steps = [ + { + name: "📥 Step 1️⃣: Fetching DOI JSON...", + command: `node 1_fetch_all_dois.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "📄 Step 2️⃣: Downloading PDFs...", + command: `node 2_fetch_all_pdfs.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "🧠 Step 3️⃣: Generating metadata...", + command: `node 3_generate_basic_metadata.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "🆙 Step 4️⃣: Uploading metadata to 
Irys...", + command: `node 4_upload_all_basic_metadata.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + { + name: "📤 Step 5️⃣: Uploading PDFs to Irys...", + command: `node 5_upload_all_pdfs.js --start-page=${batchStart} --end-page=${batchEnd}`, + }, + ]; -(async () => { for (const step of steps) { console.log(`\n${step.name}`); try { execSync(step.command, { stdio: "inherit" }); } catch (err) { - console.error(`❌ Workflow failed: ${err.message}`); + console.error(`❌ Step failed: ${err.message}`); + return false; + } + } + + // cleanup pdf files in each page folder + for (let page = batchStart; page <= batchEnd; page++) { + deletePdfFolder(page); + } + + return true; +} + +(async () => { + for (let i = startPage; i <= endPage; i += batchSize) { + const batchStart = i; + const batchEnd = Math.min(endPage, i + batchSize - 1); + const success = await runWorkflowBatch(batchStart, batchEnd); + if (!success) { + console.error(`❌ Stopping workflow due to error in batch ${batchStart}-${batchEnd}`); process.exit(1); } } - console.log("\n✅ All steps completed successfully!"); + console.log("\n✅ All batches completed successfully!"); })(); From 65d2a34eccf218f261fa4f8d95b4adfb48e18359 Mon Sep 17 00:00:00 2001 From: heinzhex Date: Fri, 6 Jun 2025 17:17:28 +0900 Subject: [PATCH 7/8] script update now you can run node 0_run_workflow.js --start-page=300000 --end-page=400000 --batch-size=10 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d851b9..23a2701 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ for dividing tasks, add --start-page=3 --end-page=4 like this, there are total 883431 pages ```bash -node 0_run_workflow.js --start-page=3 --end-page=4 +node 0_run_workflow.js --start-page=300000 --end-page=400000 --batch-size=10 ``` From 24aa52c6fac2c758779defc7fdb9a743fbdf7bbe Mon Sep 17 00:00:00 2001 From: heinzhex Date: Fri, 6 Jun 2025 17:20:58 +0900 Subject: [PATCH 8/8] script update now you can run 
node 0_run_workflow.js --start-page=300000 --end-page=400000 --batch-size=10 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 23a2701..5677cf8 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # 📄 SciUploader – Bulk Sci-Hub PDF Downloader -This tool automates the batch download of academic papers from Sci-Hub using DOIs and organizes the PDFs for further metadata processing and decentralized storage (e.g., Arweave/Irys). +This tool automates the batch download of academic papers from Sci-Hub using DOIs and organizes the PDFs for further metadata processing and decentralized storage on Irys. ---