diff --git a/src/paperbot/utils/CCS-DOWN.py b/src/paperbot/utils/CCS-DOWN.py deleted file mode 100644 index edb5397f..00000000 --- a/src/paperbot/utils/CCS-DOWN.py +++ /dev/null @@ -1,1052 +0,0 @@ -# securipaperbot/utils/downloader.py - -from typing import Dict, List, Any, Optional -import aiohttp -import asyncio -import httpx -from pathlib import Path -import urllib.parse -from bs4 import BeautifulSoup -import re -import json -import time -import random -from datetime import datetime -import logging -import traceback - -# 添加动态cookie获取支持 -import traceback -try: - # curl_cffi 0.5.x 版本中, AsyncSession 位于 requests 模块下 - from curl_cffi.requests import AsyncSession - CURL_CFFI_AVAILABLE = True -except ImportError: - from typing import Any as AsyncSession # Mock for type hinting - CURL_CFFI_AVAILABLE = False - print("❌ 'curl_cffi' 导入失败。详细错误信息如下:") - traceback.print_exc() - print("警告: curl_cffi 未安装或无法加载,动态cookie获取功能(如ACM)将受限") - -try: - import cloudscraper - CLOUDSCRAPER_AVAILABLE = True -except ImportError: - CLOUDSCRAPER_AVAILABLE = False - print("警告: cloudscraper 未安装,动态cookie获取功能(如ACM)将受限") - - -# 使用标准日志,避免相对导入问题 -def setup_logger(name): - logger = logging.getLogger(name) - if not logger.handlers: - logger.setLevel(logging.INFO) - handler = logging.StreamHandler() - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - return logger - - -class PaperDownloader: - """论文下载工具类 - 优化版本,使用持久化会话""" - - # 会议基本信息配置 - CONFERENCE_INFO = { - 'sp': { - 'base_url': 'https://ieeexplore.ieee.org/xpl/conhome/1000487/all-proceedings', - 'headers': { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0' - } - }, - 'ndss': { - 'base_url': 'https://www.ndss-symposium.org', - 'headers': { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0' - } - }, - 'usenix': { - 'base_url': 'https://www.usenix.org/conference/usenixsecurity', - 'headers': { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0' - } - } - } - - async def _download_with_retry(self, url: str) -> Optional[bytes]: - """ - 智能下载实现,带自动重试和反爬处理。 - - Args: - url (str): 要下载的URL - - Returns: - Optional[bytes]: 下载的内容,失败返回None - """ - # 验证并确保会话可用 - if not self.session: - try: - self.logger.info("正在重新创建持久化会话...") - self.session = AsyncSession() - except Exception as e: - self.logger.error(f"创建持久化会话失败: {e}") - return None - - last_error = None - content = None - - for attempt in range(1, self.max_retries + 1): - try: - self.logger.info(f"下载尝试 {attempt}/{self.max_retries}: {url}") - - # 配置特殊headers以绕过反爬 - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/91.0.4472.124 Safari/537.36' - } - - # 验证会话状态 - if not self.session or getattr(self.session, '_closed', False): - self.logger.warning("会话已关闭,正在重新创建...") - self.session = AsyncSession() - - # 使用curl_cffi的持久化会话和浏览器仿真 - response = await self.session.get( - url, - impersonate="chrome110", - headers=headers, - timeout=60 - ) - - # 检查HTTP状态码 - if response.status_code == 403: - self.logger.warning(f"遇到403 Forbidden,可能是反爬限制 (尝试 {attempt}/{self.max_retries})") - await asyncio.sleep(self.retry_delay * attempt) # 指数退避 - continue - - elif response.status_code == 429: - self.logger.warning(f"遇到429 Too Many Requests,开始等待 (尝试 {attempt}/{self.max_retries})") - await asyncio.sleep(self.retry_delay * 2 * attempt) # 更长的等待 - continue - - elif response.status_code != 200: - self.logger.warning(f"HTTP {response.status_code} (尝试 {attempt}/{self.max_retries})") - await asyncio.sleep(self.retry_delay) - continue - - # 获取响应内容 - content = response.content - - # 验证内容 - if not content or len(content) < 1024: # 小于1KB可能是错误页面 - self.logger.warning(f"响应内容过小: {len(content) if content else 0} bytes") - continue - - # 对于PDF,验证文件头 - if url.lower().endswith('.pdf') and not content.startswith(b'%PDF'): - self.logger.warning("响应不是有效的PDF格式") - continue - - self.logger.info(f"✅ 成功下载: {len(content)} bytes") - return content - - except Exception as e: - last_error = e - self.logger.warning(f"下载出错 (尝试 {attempt}/{self.max_retries}): {e}") - await asyncio.sleep(self.retry_delay) - continue - - # 所有重试都失败 - if last_error: - self.logger.error(f"❌ 下载失败,已达到最大重试次数。最后错误: {last_error}") - else: - self.logger.error("❌ 下载失败,未获得有效内容") - return None - - def _sanitize_filename(self, filename: str) -> str: - """清理并规范化文件名,移除非法字符""" - # 替换 Windows 文件系统不允许的字符 - invalid_chars = r'[\\/:"*?<>|]+' - filename = re.sub(invalid_chars, '_', filename) - - # 将连续的空白字符替换为单个空格 - filename = re.sub(r'\s+', ' ', filename) - - # 去除首尾空白 - filename = filename.strip() - - # 如果文件名为空,使用默认名称 - if not filename: - filename = 
f"paper_{int(time.time())}" - - # 限制文件名长度(Windows 最大路径长度为 260 字符) - max_length = 200 # 留一些余地给路径和扩展名 - if len(filename) > max_length: - filename = filename[:max_length-3] + "..." - - return filename - - def __init__(self, config: Optional[Dict[str, Any]] = None): - self.config = config or {} - self.logger = setup_logger(__name__) - self.download_path = Path(self.config.get('download_path', './papers')) - self.download_path.mkdir(parents=True, exist_ok=True) - - self.session: Optional[AsyncSession] = None - - # 配置下载重试参数 - self.max_retries = self.config.get('max_retries', 3) - self.retry_delay = self.config.get('retry_delay', 3) - - # 并发控制 - max_concurrent = 1 - self.semaphore = asyncio.Semaphore(max_concurrent) - - - # 会议URL模板 - self.conference_urls = { - 'ccs': 'https://dl.acm.org/doi/proceedings/', - 'sp': 'https://ieeexplore.ieee.org/xpl/conhome/', - 'ndss': 'https://www.ndss-symposium.org/', - 'usenix': 'https://www.usenix.org/conference/' - } - - async def __aenter__(self): - """创建并返回一个持久化的 curl_cffi 会话.""" - try: - if self.session and not getattr(self.session, '_closed', False): - self.logger.info("使用现有的持久化会话...") - return self - - self.logger.info("正在创建新的持久化会话...") - self.session = AsyncSession() - return self - except Exception as e: - self.logger.error(f"创建持久化会话失败: {e}") - raise - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """关闭持久化会话.""" - try: - if self.session: - # 检查会话是否已经关闭 - is_closed = getattr(self.session, '_closed', True) - - if not is_closed and hasattr(self.session, 'close'): - try: - self.logger.info("正在关闭持久化会话...") - await self.session.close() - except Exception as e: - self.logger.warning(f"关闭会话时出现异常: {e}") - else: - self.logger.info("会话已经关闭,无需再次关闭") - except Exception as e: - self.logger.warning(f"处理会话关闭时出现异常: {e}") - finally: - # 确保会话对象被清理 - self.session = None - - async def download_paper(self, url: str, title: str, paper_index: int = 0, total_papers: int = 0) -> Dict[str, Any]: - """下载单篇论文 - 优化版本""" - async with 
self.semaphore: - try: - # 生成文件名 - 为IEEE论文添加特殊前缀 - safe_title = self._sanitize_filename(title) - - # 使用简化的文件名:只使用论文标题 - filename = f"{safe_title}.pdf" - - file_path = self.download_path / filename - - # 显示下载进度(与NDSS/USENIX保持一致) - if total_papers > 0: - progress = (paper_index + 1) / total_papers * 100 - print(f"💾 [{paper_index+1}/{total_papers}] 下载: {title[:50]}{'...' if len(title) > 50 else ''}") - - # 检查是否已下载并验证文件 - if file_path.exists(): - # 验证文件大小,过小的文件可能是错误页面 - file_size = file_path.stat().st_size - if file_size > 1024: # 大于1KB认为有效 - return { - 'success': True, - 'path': str(file_path), - 'cached': True, - 'size': file_size - } - else: - # 删除无效文件 - file_path.unlink() - self.logger.warning(f"Removed invalid cached file: {file_path}") - - # 下载论文 - content = await self._download_with_retry(url) - if content: - # 验证下载内容 - if len(content) < 1024: - raise Exception(f"Downloaded content too small ({len(content)} bytes), likely an error page") - - # 保存文件 - file_path.write_bytes(content) - file_size = len(content) - - return { - 'success': True, - 'path': str(file_path), - 'cached': False, - 'size': file_size - } - else: - raise Exception("Failed to download paper - no content received") - - except Exception as e: - self.logger.error(f"Error downloading paper {title}: {str(e)}") - return { - 'success': False, - 'error': str(e) - } - - async def _parse_sp_papers(self, year: str) -> List[Dict[str, Any]]: - """解析 IEEE S&P 论文列表""" - papers = [] - full_year = f"20{year}" if len(year) == 2 else year - - try: - print(f"📚 正在获取 IEEE S&P {full_year} 论文列表...") - conf_info = self.CONFERENCE_INFO['sp'] - base_url = f"{conf_info['base_url']}" - - # 使用会话发送请求 - if not self.session: - self.session = AsyncSession() - - response = await self.session.get( - base_url, - headers=conf_info['headers'], - impersonate="chrome110" - ) - - if response.status_code != 200: - raise Exception(f"获取会议页面失败: HTTP {response.status_code}") - - # 解析页面内容 - soup = BeautifulSoup(response.text, 'html.parser') - 
paper_items = soup.select('div.paper-item') - - for item in paper_items: - title_elem = item.select_one('h3.paper-title') - if not title_elem: - continue - - title = title_elem.text.strip() - url = item.select_one('a[href*=".pdf"]') - if not url: - continue - - pdf_url = url['href'] - if not pdf_url.startswith('http'): - pdf_url = f"https://ieeexplore.ieee.org{pdf_url}" - - papers.append({ - 'title': title, - 'url': pdf_url - }) - - print(f"✅ 找到 {len(papers)} 篇论文") - return papers - - except Exception as e: - print(f"❌ 获取 IEEE S&P {full_year} 论文列表失败: {str(e)}") - return [] - - async def _parse_ndss_papers(self, year: str) -> List[Dict[str, Any]]: - """解析 NDSS 论文列表""" - papers = [] - full_year = f"20{year}" if len(year) == 2 else year - - try: - print(f"📚 正在获取 NDSS {full_year} 论文列表...") - conf_info = self.CONFERENCE_INFO['ndss'] - base_url = f"{conf_info['base_url']}/ndss{year}/accepted-papers" - - # 使用会话发送请求 - if not self.session: - self.session = AsyncSession() - - response = await self.session.get( - base_url, - headers=conf_info['headers'], - impersonate="chrome110" - ) - - if response.status_code != 200: - raise Exception(f"获取会议页面失败: HTTP {response.status_code}") - - # 解析页面内容 - soup = BeautifulSoup(response.text, 'html.parser') - paper_items = soup.select('div.paper-item, div.accepted-paper') - - for item in paper_items: - title_elem = item.select_one('h3.paper-title, h4.paper-title, div.paper-title') - if not title_elem: - continue - - title = title_elem.text.strip() - url = item.select_one('a[href*=".pdf"]') - if not url: - continue - - pdf_url = url['href'] - if not pdf_url.startswith('http'): - pdf_url = f"{conf_info['base_url']}{pdf_url}" - - papers.append({ - 'title': title, - 'url': pdf_url - }) - - print(f"✅ 找到 {len(papers)} 篇论文") - return papers - - except Exception as e: - print(f"❌ 获取 NDSS {full_year} 论文列表失败: {str(e)}") - return [] - - async def _parse_usenix_papers(self, year: str) -> List[Dict[str, Any]]: - """解析 USENIX Security 论文列表""" - papers = 
[] - full_year = f"20{year}" if len(year) == 2 else year - - try: - print(f"📚 正在获取 USENIX Security {full_year} 论文列表...") - conf_info = self.CONFERENCE_INFO['usenix'] - base_url = f"{conf_info['base_url']}{full_year}/technical-sessions" - - # 使用会话发送请求 - if not self.session: - self.session = AsyncSession() - - response = await self.session.get( - base_url, - headers=conf_info['headers'], - impersonate="chrome110" - ) - - if response.status_code != 200: - raise Exception(f"获取会议页面失败: HTTP {response.status_code}") - - # 解析页面内容 - soup = BeautifulSoup(response.text, 'html.parser') - paper_items = soup.select('div.paper-item, div.node-paper') - - for item in paper_items: - title_elem = item.select_one('h2.node-title, div.field-title') - if not title_elem: - continue - - title = title_elem.text.strip() - url = item.select_one('a[href*=".pdf"]') - if not url: - continue - - pdf_url = url['href'] - if not pdf_url.startswith('http'): - pdf_url = f"https://www.usenix.org{pdf_url}" - - papers.append({ - 'title': title, - 'url': pdf_url - }) - - print(f"✅ 找到 {len(papers)} 篇论文") - return papers - - except Exception as e: - print(f"❌ 获取 USENIX Security {full_year} 论文列表失败: {str(e)}") - return [] - - async def get_conference_papers(self, conference: str, year: str) -> List[Dict[str, Any]]: - """获取会议论文列表 - 带进度显示""" - try: - conf_info = self.CONFERENCE_INFO.get(conference) - if not conf_info and conference != 'ccs': - raise ValueError(f"不支持的会议: {conference}") - - papers = [] - print(f"🔍 正在获取 {conference.upper()} {year} 论文列表...") - - # 根据会议类型选择相应的解析方法 - if conference == 'ccs': - papers = await self._parse_ccs_papers(self.conference_urls[conference], year) - elif conference == 'sp': - papers = await self._parse_sp_papers(year) - elif conference == 'ndss': - papers = await self._parse_ndss_papers(year) - elif conference == 'usenix': - papers = await self._parse_usenix_papers(year) - - if papers: - print(f"✅ 成功获取 {len(papers)} 篇论文信息") - - # 显示找到的论文标题 - print(f"📋 找到的论文列表:") - for i, paper 
in enumerate(papers[:10]): - title = paper.get('title', '未知标题')[:60] - print(f" {i+1:2d}. {title}{'...' if len(paper.get('title', '')) > 60 else ''}") - - if len(papers) > 10: - print(f" ... 和其他 {len(papers) - 10} 篇论文") - - # 开始PDF链接验证与进度显示 - print(f"\n🔗 正在验证PDF链接有效性...") - valid_count = 0 - - for i, paper in enumerate(papers): - # 显示进度 - progress = (i + 1) / len(papers) * 100 - progress_bar = '█' * int(progress // 5) + '░' * (20 - int(progress // 5)) - print(f"\r📋 [进度: {progress_bar}] {progress:.1f}% ({i+1}/{len(papers)}) 验证: {paper.get('title', '未知标题')[:30]}...", end='', flush=True) - - # 检查URL有效性 - if isinstance(paper.get('url'), str) and paper['url'].strip(): - valid_count += 1 - - print(f"\n✅ PDF链接验证完成: {valid_count}/{len(papers)} 个有效链接") - else: - print(f"⚠️ 未找到任何论文") - - return papers - - except Exception as e: - self.logger.error(f"获取论文列表失败: {str(e)}") - raise - """获取会议论文列表 - 带进度显示""" - try: - if conference not in self.conference_urls: - raise ValueError(f"不支持的会议: {conference}") - - base_url = self.conference_urls[conference] - papers = [] - - print(f"🔍 正在获取 {conference.upper()} {year} 论文列表...") - - # 规范化年份格式 - year = self.year_formats[conference](year) - - # 根据会议类型选择相应的解析方法 - papers = await self._get_papers_by_conference(conference, base_url, year) - if papers: - print(f"✨ 成功获取 {len(papers)} 篇论文信息") - return papers - - except Exception as e: - self.logger.error(f"获取论文列表失败: {e}") - raise - - async def _get_papers_by_conference(self, conference: str, base_url: str, year: str) -> List[Dict[str, Any]]: - """根据会议类型获取论文列表""" - try: - if conference == 'sp': - # IEEE S&P - full_url = f"{base_url}{year}" - return await self._get_sp_papers(full_url, year) - elif conference == 'ndss': - # NDSS - return await self._get_ndss_papers(base_url, year) - elif conference == 'usenix': - # USENIX Security - return await self._get_usenix_papers(base_url, year) - elif conference == 'ccs': - # ACM CCS - return await self._get_ccs_papers(base_url, year) - else: - raise 
ValueError(f"不支持的会议: {conference}") - except Exception as e: - self.logger.error(f"获取{conference.upper()} {year}论文列表失败: {e}") - raise - - async def _get_sp_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """获取 IEEE S&P 论文列表""" - papers = [] - try: - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/94.0.4606.81' - } - async with self.session.get(base_url, headers=headers) as response: - if response.status_code != 200: - raise Exception(f"Failed to fetch SP {year} papers list") - - soup = BeautifulSoup(response.text, 'lxml') - paper_items = soup.find_all('div', class_='article-list__item') - - for item in paper_items: - title_elem = item.find('h3', class_='article-list__title') - if not title_elem: - continue - - title = title_elem.text.strip() - pdf_link = item.find('a', class_='pdf-link') - - if pdf_link and 'href' in pdf_link.attrs: - url = pdf_link['href'] - if not url.startswith('http'): - url = f"https://www.computer.org{url}" - papers.append({ - 'title': title, - 'url': url - }) - - except Exception as e: - self.logger.error(f"解析 SP {year} 论文列表失败: {e}") - raise - - return papers - - async def _get_ndss_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """获取 NDSS 论文列表""" - papers = [] - try: - url = f"{base_url}ndss{year}/accepted-papers" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/94.0.4606.81' - } - async with self.session.get(url, headers=headers) as response: - if response.status_code != 200: - raise Exception(f"Failed to fetch NDSS {year} papers list") - - soup = BeautifulSoup(response.text, 'lxml') - paper_items = soup.find_all('div', class_='paper-item') - - for item in paper_items: - title_elem = item.find('h2', class_='title') - if not title_elem: - continue - - title = title_elem.text.strip() - pdf_link = item.find('a', href=lambda x: x and x.endswith('.pdf')) - - if pdf_link and 'href' in pdf_link.attrs: - url = pdf_link['href'] - if not 
url.startswith('http'): - url = f"https://www.ndss-symposium.org{url}" - papers.append({ - 'title': title, - 'url': url - }) - - except Exception as e: - self.logger.error(f"解析 NDSS {year} 论文列表失败: {e}") - raise - - return papers - - async def _get_usenix_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """获取 USENIX Security 论文列表""" - papers = [] - try: - url = f"{base_url}{year}/technical-sessions" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/94.0.4606.81' - } - async with self.session.get(url, headers=headers) as response: - if response.status_code != 200: - raise Exception(f"Failed to fetch USENIX {year} papers list") - - soup = BeautifulSoup(response.text, 'lxml') - paper_items = soup.find_all('div', class_='node-paper') - - for item in paper_items: - title_elem = item.find('h2', class_='node-title') - if not title_elem: - continue - - title = title_elem.text.strip() - pdf_link = item.find('a', href=lambda x: x and x.endswith('.pdf')) - - if pdf_link and 'href' in pdf_link.attrs: - url = pdf_link['href'] - if not url.startswith('http'): - url = f"https://www.usenix.org{url}" - papers.append({ - 'title': title, - 'url': url - }) - - except Exception as e: - self.logger.error(f"解析 USENIX {year} 论文列表失败: {e}") - raise - - return papers - - async def get_papers(self, conference: str, year: str) -> List[Dict[str, Any]]: - """ - 获取指定会议和年份的论文列表 - """ - try: - base_url = self.conference_urls.get(conference, {}).get(year) - if not base_url: - self.logger.error(f"未找到 {conference} {year} 的URL配置") - return [] - - if conference == 'ccs': - papers = await self._parse_ccs_papers(base_url, year) - elif conference == 'sp': - papers = await self._parse_sp_papers(base_url, year) - elif conference == 'ndss': - papers = await self._parse_ndss_papers(base_url, year) - elif conference == 'usenix': - papers = await self._parse_usenix_papers(base_url, year) - else: - self.logger.error(f"不支持的会议类型: {conference}") - return [] - - if 
papers: - print(f"✅ 成功获取 {len(papers)} 篇论文信息") - - # 显示找到的论文标题 - print(f"📋 找到的论文列表:") - for i, paper in enumerate(papers[:10]): - title = paper.get('title', '未知标题')[:60] - print(f" {i+1:2d}. {title}{'...' if len(paper.get('title', '')) > 60 else ''}") - - if len(papers) > 10: - print(f" ... 和其他 {len(papers) - 10} 篇论文") - - # 开始PDF链接验证与进度显示 - print(f"\n🔗 正在验证PDF链接有效性...") - valid_count = 0 - - for i, paper in enumerate(papers): - # 显示进度 - progress = (i + 1) / len(papers) * 100 - progress_bar = '█' * int(progress // 5) + '░' * (20 - int(progress // 5)) - print(f"\r📋 [进度: {progress_bar}] {progress:.1f}% ({i+1}/{len(papers)}) 验证: {paper.get('title', '未知标题')[:30]}...", end='', flush=True) - - # 检查URL有效性 - if isinstance(paper.get('url'), str) and paper['url'].strip(): - valid_count += 1 - - print(f"\n✅ PDF链接验证完成: {valid_count}/{len(papers)} 个有效链接") - else: - print(f"⚠️ 未找到任何论文") - - return papers - - except Exception as e: - self.logger.error(f"Error getting papers for {conference} {year}: {str(e)}") - raise - - - - async def _parse_ccs_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """ - 解析CCS论文列表的主入口函数。 - 使用持久化会话来执行所有相关请求。 - - Args: - base_url: 论文列表的基础URL - year: 会议年份 - - Returns: - 论文信息列表 - """ - papers = [] - try: - if not self.session: - raise RuntimeError("持久化会话未初始化。请在 'async with' 块中使用 PaperDownloader。") - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - - paper_dois = await self._get_all_ccs_dois_from_proceedings_page(self.session, year) - if not paper_dois: - self.logger.error(f"❌ 未能为CCS {year} 获取任何论文的DOI。") - return [] - - self.logger.info(f"📚 开始通过API批量解析 {len(paper_dois)} 篇CCS论文的详细信息...") - - papers = await self._fetch_all_ccs_paper_details_via_api(self.session, paper_dois, year) - return papers - - except Exception as e: - self.logger.error(f"❌ CCS论文解析主流程错误: {str(e)}") - raise - - async def _fetch_all_ccs_paper_details_via_api(self, 
session: AsyncSession, dois: List[str], year: str) -> List[Dict[str, Any]]: - """ - 使用POST请求批量获取所有CCS论文的JSON数据并解析。 - """ - api_url = "https://dl.acm.org/action/exportCiteProcCitation" - headers = { - 'Host': 'dl.acm.org', - 'Cookie': '_cf_bm=12; _cfuvid=eKvDTOvVWyHDD5bNf_GLEG_fzdrvwq1g_7YIL.aZOJU-1756624678973-0.0.1.1-604800000', - 'Pragma': 'no-cache', - 'Accept': '*/*', - 'Dnt': '1', - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0', - 'Sec-Ch-Ua-Platform-Version': '"19.0.0"', - 'Origin': 'https://dl.acm.org', - 'Referer': 'https://dl.acm.org/doi/proceedings/10.1145/3658644', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', - 'Priority': 'u=1, i', - } - dois_payload = ",".join(dois) - data_string = f"dois={dois_payload}&targetFile=custom-bibtex&format=json" - content_length = str(len(data_string.encode('utf-8'))) - headers['Content-Length'] = content_length - try: - self.logger.info(f"🚀 正在向 {api_url} 发送单次POST请求以获取 {len(dois)} 篇论文的JSON数据...") - response = await session.post( - api_url, - data=data_string, - headers=headers, - impersonate="chrome110", - timeout=180 - ) - if response.status_code != 200: - self.logger.error(f"❌ 批量获取JSON失败,HTTP状态码: {response.status_code}") - self.logger.error(f"响应内容: {response.text[:500]}") - debug_file = Path(f"debug_ccs_api_error_{response.status_code}.html") - debug_file.write_text(response.text, encoding='utf-8') - self.logger.info(f"🐛 已将错误响应保存到 {debug_file.absolute()} 以供调试。") - return [] - json_data_str = response.text - self.logger.info(f"✅ 成功获取JSON数据,大小: {len(json_data_str)}字节。开始解析...") - debug_json_file = Path("debug_ccs_json_response.json") - debug_json_file.write_text(json_data_str, encoding='utf-8') - self.logger.info(f"🐛 已将原始JSON响应保存到 {debug_json_file.absolute()} 以供分析。") - papers = 
self._parse_json_data(json_data_str, year) - self.logger.info(f"✅ 成功解析 {len(papers)}/{len(dois)} 篇论文的元数据。") - return papers - except Exception as e: - self.logger.error(f"❌ 批量获取和解析JSON数据时发生严重错误: {str(e)}") - import traceback - traceback.print_exc() - return [] - - def _parse_json_data(self, json_data_str: str, year: str) -> List[Dict[str, Any]]: - """ - 解析从ACM API返回的JSON数据。 - """ - papers = [] - try: - data = json.loads(json_data_str) - for item_dict in data.get('items', []): - for doi, details in item_dict.items(): - try: - title = details.get('title', f"未知标题 (DOI: {doi})") - authors_list = [] - for author_info in details.get('author', []): - given_name = author_info.get('given', '') - family_name = author_info.get('family', '') - authors_list.append(f"{given_name} {family_name}".strip()) - abstract = details.get('abstract', '摘要不可用') - pdf_url = f"https://dl.acm.org/doi/pdf/{doi}" - papers.append({ - 'title': title, - 'authors': authors_list, - 'abstract': abstract, - 'url': pdf_url, - 'conference': "CCS", - 'year': year - }) - except Exception as e: - self.logger.warning(f"解析单个JSON条目时出错 (DOI: {doi}): {e}") - continue - return papers - except json.JSONDecodeError as e: - self.logger.error(f"❌ JSON解析失败: {e}") - return [] - except Exception as e: - self.logger.error(f"❌ 处理JSON数据时发生未知错误: {e}") - return [] - - - def _parse_bibtex_data(self, bibtex_data: str, year: str) -> List[Dict[str, Any]]: - """ - 解析BibTeX数据字符串并返回论文列表。 - 使用更健壮的正则表达式来处理复杂的BibTeX格式。 - """ - papers = [] - # 使用更可靠的方式分割条目:按换行符后的'@'分割 - entries = re.split(r'\n@', bibtex_data) - - for entry in entries: - if not entry.strip() or not entry.startswith('inproceedings'): - continue - - try: - # 健壮的DOI提取 - doi_match = re.search(r'doi\s*=\s*\{([^}]+)\}', entry, re.IGNORECASE) - doi = doi_match.group(1).strip() if doi_match else "未知DOI" - - # 健壮的标题提取,能处理嵌套花括号 - title_match = re.search(r'title\s*=\s*\{((?:[^{}]|\{[^{}]*\})+)\}', entry, re.IGNORECASE) - title = title_match.group(1).strip().replace("{", 
"").replace("}", "") if title_match else f"未知标题 (DOI: {doi})" - - # 健壮的作者提取 - author_match = re.search(r'author\s*=\s*\{([^}]+)\}', entry, re.IGNORECASE) - authors_str = author_match.group(1) if author_match else "" - authors_list = [name.strip().replace("{", "").replace("}", "") for name in authors_str.split(' and ')] - - abstract = "摘要需访问论文页面查看" - pdf_url = f"https://dl.acm.org/doi/pdf/{doi}" - - papers.append({ - 'title': title, - 'authors': authors_list, - 'abstract': abstract, - 'url': pdf_url, - 'conference': "CCS", - 'year': year - }) - except Exception as e: - self.logger.warning(f"解析单个BibTeX条目时出错: {e}\n条目内容: {entry[:300]}...") - continue - return papers - - async def _get_all_ccs_dois_from_proceedings_page(self, session: AsyncSession, year: str) -> Optional[List[str]]: - """ - 获取CCS会议指定年份所有论文的DOI列表。 - 使用传入的持久化会话。 - """ - if not CURL_CFFI_AVAILABLE: - self.logger.error("❌ curl_cffi 未安装,无法执行CCS论文抓取。") - return None - - try: - short_year_str = f"'{year}" - full_year_str = f"20{year}" if len(year) == 2 else year - - proceedings_list_url = 'https://dl.acm.org/conference/ccs/proceedings' - self.logger.info(f"🌐 正在通过持久化会话访问CCS会议列表页面: {proceedings_list_url}") - - # 使用更完整的请求头来模拟浏览器行为 - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', - 'Cache-Control': 'max-age=0', - 'Connection': 'keep-alive', - 'DNT': '1', - 'Host': 'dl.acm.org', - 'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"', - 'Sec-Ch-Ua-Mobile': '?0', - 'Sec-Ch-Ua-Platform': '"Windows"', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.0.0' - } - - # 尝试多次,使用不同的浏览器模拟配置 - for browser in 
["chrome110", "chrome99", "chrome100", "safari15_3"]: - try: - response = await session.get( - proceedings_list_url, - impersonate=browser, - headers=headers, - timeout=45 - ) - - if response.status_code == 200: - self.logger.info(f"✅ 使用 {browser} 成功访问") - break - else: - self.logger.warning(f"使用 {browser} 失败,HTTP状态码: {response.status_code}") - except Exception as e: - self.logger.warning(f"使用 {browser} 时出错: {str(e)}") - await asyncio.sleep(2) # 失败后等待一下再重试 - continue - - if response.status_code != 200: - self.logger.error(f"❌ 访问CCS会议列表页面失败,HTTP状态码: {response.status_code}") - return None - - self.logger.info("✅ 成功获取会议列表页面。") - soup = BeautifulSoup(response.text, 'html.parser') - - target_proc_url = None - proc_items = soup.select('li.conference__proceedings div.conference__title a') - for item in proc_items: - link_text = item.get_text(strip=True) - if short_year_str in link_text or full_year_str in link_text: - target_proc_url = urllib.parse.urljoin(proceedings_list_url, item['href']) - self.logger.info(f"✅ 找到 CCS {year} 会议录链接: {target_proc_url}") - break - - if not target_proc_url: - self.logger.error(f"❌ 未能在页面上找到 CCS {year} 的会议录链接。") - debug_file = Path("debug_acm_proceedings_list.html") - debug_file.write_text(response.text, encoding='utf-8') - self.logger.info(f"🐛 已将会议列表页面内容保存到 {debug_file.absolute()} 以供调试。") - return None - - self.logger.info(f"🌐 正在访问 CCS {year} 论文列表页面...") - response = await session.get(target_proc_url, impersonate="chrome110", timeout=45) - - if response.status_code != 200: - self.logger.error(f"❌ 访问 CCS {year} 论文列表页面失败,HTTP状态码: {response.status_code}") - return None - - self.logger.info(f"✅ 成功获取 CCS {year} 论文列表页面。") - soup = BeautifulSoup(response.text, 'html.parser') - - all_dois = [] - # 从隐藏的input中提取所有DOI - doi_inputs = soup.select('input.section--dois') - for doi_input in doi_inputs: - dois_str = doi_input.get('value', '') - if dois_str: - all_dois.extend(dois_str.split(',')) - - if not all_dois: - self.logger.warning(f"未能在 CCS {year} 
页面提取到任何论文DOI。请检查页面结构是否已更改。") - debug_file = Path(f"debug_ccs_{year}_papers.html") - debug_file.write_text(response.text, encoding='utf-8') - self.logger.info(f"🐛 已将论文列表页面内容保存到 {debug_file.absolute()} 以供调试。") - return None - - # 去重并清洗 - unique_dois = sorted(list(set(doi.strip() for doi in all_dois if doi.strip()))) - self.logger.info(f"✅ 成功提取 {len(unique_dois)} 个唯一的 CCS {year} 论文DOI。") - return unique_dois - - except Exception as e: - self.logger.error(f"❌ 获取CCS论文DOI时发生严重错误: {str(e)}") - import traceback - traceback.print_exc() - return None - - diff --git a/src/paperbot/utils/__init__.py b/src/paperbot/utils/__init__.py index 10b92850..eb88b7eb 100644 --- a/src/paperbot/utils/__init__.py +++ b/src/paperbot/utils/__init__.py @@ -2,15 +2,12 @@ """ PaperBot 工具函数模块 -包含: +暴露当前仍在主代码路径中使用的通用工具: - logger: 日志配置 - downloader: 论文下载器 - retry_helper: 重试机制 - json_parser: JSON 解析 - text_processing: 文本处理 -- search: 搜索工具 -- analyzer: 分析工具 -- conference_*: 会议相关工具 """ from paperbot.utils.logger import setup_logger, LogContext, log_with_context diff --git a/src/paperbot/utils/acm_extractor.py b/src/paperbot/utils/acm_extractor.py deleted file mode 100644 index 409505bc..00000000 --- a/src/paperbot/utils/acm_extractor.py +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -简化版ACM论文提取器 -专为downloader模块设计 -""" - -import cloudscraper -import time -import random -import urllib3 -import json -import gzip -import io -import brotli -from bs4 import BeautifulSoup -import re - -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -class ACMPaperExtractor: - """简化版ACM论文提取器""" - - def __init__(self): - self.base_url = "https://dl.acm.org" - self.scraper = None - self._init_scraper() - - def _init_scraper(self): - """初始化cloudscraper""" - self.scraper = cloudscraper.create_scraper( - browser={ - 'browser': 'chrome', - 'platform': 'windows', - 'mobile': False - }, - delay=10 - ) - - # 设置基础headers - self.scraper.headers.update({ - 'User-Agent': 
# NOTE(review): the methods below belong to an ACM (dl.acm.org) scraping class whose
# `class` header is outside this view; they rely on instance state `self.scraper`
# (a cloudscraper/requests-like session with `.get`/`.post`/`.headers`) and
# `self.base_url` — confirm against the class definition.

def get_homepage(self):
    """Visit the ACM homepage so the session picks up anti-bot cookies.

    Returns:
        bool: True if the homepage answered HTTP 200, False otherwise.
    """
    print("🏠 访问ACM主页获取cookies...")
    try:
        response = self.scraper.get(self.base_url, timeout=30)
        print(f"主页状态码: {response.status_code}")
        if response.status_code == 200:
            print("✅ 成功获取主页cookies")
            return True
        print(f"❌ 获取主页失败: HTTP {response.status_code}")
        return False
    except Exception as e:
        print(f"❌ 访问主页异常: {str(e)}")
        return False


def get_proceedings_page(self, proceedings_doi):
    """Fetch a CCS proceedings page after priming cookies via the homepage.

    Args:
        proceedings_doi: DOI of the proceedings volume (e.g. ``10.1145/xxxxxxx``).

    Returns:
        str | None: The page HTML, or ``None`` on any failure.
    """
    print(f"📂 访问CCS proceedings页面...")
    if not self.get_homepage():
        return None
    url = f"{self.base_url}/doi/proceedings/{proceedings_doi}"
    # Small random delay so the request pattern looks less like a bot.
    time.sleep(random.uniform(2, 5))
    try:
        response = self.scraper.get(url, timeout=30)
        print(f"Proceedings页面状态码: {response.status_code}")
        if response.status_code == 200:
            print("✅ 成功访问proceedings页面")
            return response.text
        print(f"❌ 访问proceedings页面失败: HTTP {response.status_code}")
        return None
    except Exception as e:
        print(f"❌ 访问proceedings页面异常: {str(e)}")
        return None


def extract_all_paper_dois(self, proceedings_content):
    """Extract every unique paper DOI (``10.1145/...``) from a proceedings page.

    Args:
        proceedings_content: HTML text of the proceedings page (may be empty/None).

    Returns:
        list[str]: DOIs in first-seen order; empty list on error or empty input.
    """
    print("🔍 从proceedings页面提取所有论文DOI...")
    if not proceedings_content:
        print("❌ proceedings页面内容为空")
        return []
    try:
        soup = BeautifulSoup(proceedings_content, 'html.parser')
        dois = []
        seen = set()  # O(1) dedup instead of `doi not in dois` (O(n) per link)
        paper_links = soup.find_all('a', href=re.compile(r'/doi/10\.1145/'))
        for link in paper_links:
            href = link.get('href', '')
            doi_match = re.search(r'/doi/([^/]+/[^/?]+)', href)
            if doi_match:
                doi = doi_match.group(1)
                # Keep only well-formed ACM DOIs, preserving discovery order.
                if doi.startswith('10.1145/') and doi not in seen:
                    seen.add(doi)
                    dois.append(doi)
        print(f"✅ 成功提取到 {len(dois)} 个论文DOI")
        return dois
    except Exception as e:
        print(f"❌ 提取论文DOI时出错: {str(e)}")
        return []


def export_citations(self, doi_list, proceedings_doi):
    """Call ACM's ``exportCiteProcCitation`` API and return the parsed JSON.

    Args:
        doi_list: Paper DOIs, with or without the ``10.1145/`` prefix.
        proceedings_doi: DOI of the proceedings volume (used for the Referer
            and to prime session cookies).

    Returns:
        dict | None: Parsed citation JSON, or ``None`` on any failure.
    """
    # FIX: `gzip`, `io` and `brotli` were referenced below but never imported
    # anywhere in this module, so the decompression branches always raised
    # NameError and silently fell back to `response.text`. Import them locally;
    # brotli is optional third-party, so guard it.
    import gzip
    import io
    try:
        import brotli
    except ImportError:
        brotli = None

    print("📚 使用export citation API导出引用信息...")
    # Make sure homepage + proceedings cookies are in place first.
    self.get_proceedings_page(proceedings_doi)

    api_url = f"{self.base_url}/action/exportCiteProcCitation"
    # Normalize DOIs so every entry carries the 10.1145/ prefix.
    formatted_doi_list = [
        doi if doi.startswith('10.1145/') else f"10.1145/{doi}"
        for doi in doi_list
    ]
    data = {
        'dois': ','.join(formatted_doi_list),
        'targetFile': 'custom-bibtex',
        'format': 'bibTex'
    }
    api_headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Accept': '*/*',
        'Origin': self.base_url,
        'Referer': f"{self.base_url}/doi/proceedings/{proceedings_doi}",
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'DNT': '1',
        'sec-ch-ua': '"Microsoft Edge";v="139", "Chromium";v="139", "Not A(Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    self.scraper.headers.update(api_headers)
    time.sleep(random.uniform(2, 4))

    try:
        response = self.scraper.post(api_url, data=data, timeout=30)
        print(f"API状态码: {response.status_code}")
        if response.status_code != 200:
            print(f"❌ API请求失败: HTTP {response.status_code}")
            return None
        print("✅ API请求成功")

        content_encoding = response.headers.get('Content-Encoding', '').lower()
        if 'br' in content_encoding:
            print("🔍 检测到Brotli压缩,正在解压...")
            try:
                if brotli is None:
                    raise ImportError("brotli is not installed")
                content = brotli.decompress(response.content).decode('utf-8')
                print("✅ Brotli解压成功")
            except Exception as e:
                print(f"⚠️ Brotli解压失败: {str(e)}")
                content = response.text
        elif 'gzip' in content_encoding:
            print("🔍 检测到gzip压缩,正在解压...")
            try:
                compressed_data = io.BytesIO(response.content)
                with gzip.GzipFile(fileobj=compressed_data) as gzip_file:
                    content = gzip_file.read().decode('utf-8')
                print("✅ gzip解压成功")
            except Exception as e:
                print(f"⚠️ gzip解压失败: {str(e)}")
                content = response.text
        else:
            content = response.text
            print("📄 无压缩或未知压缩格式")

        try:
            citation_data = json.loads(content)
            print("✅ 成功解析JSON响应")
            return citation_data
        except Exception as e:
            print(f"⚠️ JSON解析失败: {str(e)}")
            return None
    except Exception as e:
        print(f"❌ API请求异常: {str(e)}")
        return None


def extract_paper_info(self, citation_data):
    """Flatten an exportCiteProcCitation response into per-paper dicts.

    Args:
        citation_data: Parsed JSON from :func:`export_citations`; expected to
            contain an ``items`` list of ``{doi: metadata}`` maps.

    Returns:
        list[dict]: One dict per paper with title/doi/url/authors/pdf_url/
        abstract/publisher/isbn/pages/keywords; empty list on bad input.
    """
    print("🔍 从引用数据中提取论文信息...")
    if not citation_data or 'items' not in citation_data:
        print("❌ 无效的引用数据")
        return []

    paper_info_list = []
    items = citation_data['items']
    print(f" 处理 {len(items)} 个条目...")

    for item in items:
        # Each item is a dict keyed by DOI.
        for doi, paper_data in item.items():
            try:
                title = paper_data.get('title', 'Unknown Title')
                # CSL-JSON authors: either family/given parts or a literal name.
                author_names = []
                for author in paper_data.get('author', []):
                    if 'family' in author and 'given' in author:
                        author_names.append(f"{author['given']} {author['family']}")
                    elif 'literal' in author:
                        author_names.append(author['literal'])

                url = f"{self.base_url}/doi/{doi}"
                pdf_url = f"{self.base_url}/doi/pdf/{doi}"
                paper_info_list.append({
                    'title': title,
                    'doi': doi,
                    'url': url,
                    'authors': author_names,
                    'pdf_url': pdf_url,
                    'abstract': paper_data.get('abstract', ''),
                    'publisher': paper_data.get('publisher', ''),
                    'isbn': paper_data.get('ISBN', ''),
                    'pages': paper_data.get('page', ''),
                    'keywords': paper_data.get('keyword', '')
                })
            except Exception as e:
                print(f" ⚠️ 处理条目时出错: {str(e)}")
                continue

    print(f"✅ 提取到 {len(paper_info_list)} 个论文信息")
    return paper_info_list
class ConferenceDownloader:
    """Base class for per-conference paper downloaders.

    Subclasses implement :meth:`get_conference_papers`; this class provides the
    aiohttp session lifecycle, filename sanitizing, and cached PDF download.
    """

    def __init__(self, config):
        self.base_url = ""
        # Download directory comes from config (default ./papers) and is created eagerly.
        self.download_path = Path(config.get('download_path', './papers'))
        self.download_path.mkdir(parents=True, exist_ok=True)
        self.session = None

    async def __aenter__(self):
        # One shared HTTP session for the lifetime of the context.
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, exc_type, exc, tb):
        if self.session:
            await self.session.close()

    async def get_conference_papers(self, conference, year):
        """Return the paper list for a conference/year; implemented by subclasses."""
        raise NotImplementedError("This method should be implemented by subclasses.")

    def sanitize_filename(self, filename):
        """Strip characters that are invalid in Windows/POSIX filenames."""
        return re.sub(r'[\\/*?:"<>|]', "", filename)

    async def download_paper(self, url, title, paper_index, total_papers):
        """Download one PDF to the download directory, skipping cached files.

        Returns:
            dict: ``{'success': bool, ...}`` with ``path``/``size``/``cached``
            on success or ``error`` on failure.
        """
        if not url or not isinstance(url, str):
            return {'success': False, 'error': 'Invalid URL'}

        sanitized_title = self.sanitize_filename(title)
        pdf_filename = self.download_path / f"{sanitized_title}.pdf"

        # Cache hit: the file was downloaded on a previous run.
        if pdf_filename.exists():
            print(f"[{paper_index + 1}/{total_papers}] 📄 '{sanitized_title}' 已存在 (缓存).")
            return {'success': True, 'cached': True, 'path': str(pdf_filename)}

        try:
            print(f"[{paper_index + 1}/{total_papers}] 📥 开始下载: {title}")
            async with self.session.get(url, timeout=aiohttp.ClientTimeout(total=300)) as response:
                if response.status == 200:
                    content = await response.read()
                    with open(pdf_filename, 'wb') as f:
                        f.write(content)
                    return {'success': True, 'cached': False, 'path': str(pdf_filename), 'size': len(content)}
                return {'success': False, 'error': f"HTTP status {response.status}"}
        except Exception as e:
            return {'success': False, 'error': str(e)}


def setup_logger(name):
    """Create (once) and return a stream logger with a standard format."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger


class ConferenceHelpers:
    """Fetch + parse paper listings for IEEE S&P, NDSS and USENIX Security.

    All fetchers take a curl_cffi ``AsyncSession`` whose ``get()`` is awaited
    directly and returns a response exposing ``.status_code`` / ``.text``.
    """

    def __init__(self):
        self.logger = setup_logger(__name__)

    async def get_sp_papers(self, session: "AsyncSession", base_url: str, year: str) -> List[Dict[str, Any]]:
        """Fetch the IEEE S&P listing page and return paper dicts."""
        papers = []
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            }

            # FIX: curl_cffi's AsyncSession.get() is awaited directly and returns a
            # plain response (`.status_code`, `.text`); the original used the aiohttp
            # idiom (`async with`, `.status`, `await .text()`), which fails at
            # runtime with curl_cffi — get_ndss_papers below already did it right.
            response = await session.get(base_url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                paper_items = soup.find_all('div', class_='article-list__item')

                for item in paper_items:
                    title_elem = item.find('h3', class_='article-list__title')
                    if not title_elem:
                        continue

                    title = title_elem.text.strip()
                    pdf_url = await self._get_ieee_pdf_url(item)
                    if pdf_url:
                        papers.append({
                            'title': title,
                            'url': pdf_url,
                            'conference': 'SP',
                            'year': year
                        })
            else:
                raise Exception(f"Failed to fetch SP {year} papers list")

            return papers

        except Exception as e:
            self.logger.error(f"解析 SP {year} 论文列表失败: {e}")
            raise

    async def get_ndss_papers(self, session: "AsyncSession", base_url: str, year: str) -> List[Dict[str, Any]]:
        """Fetch the NDSS accepted-papers page and return paper dicts."""
        papers = []
        try:
            url = f"{base_url}ndss{year}/accepted-papers"
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            response = await session.get(url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                paper_items = soup.find_all(['div', 'article'], class_=['paper-item', 'accepted-paper'])

                for item in paper_items:
                    title_elem = item.find(['h2', 'h3', 'h4'], class_=['title', 'paper-title'])
                    if not title_elem:
                        continue

                    title = title_elem.text.strip()
                    pdf_link = item.find('a', href=lambda x: x and x.endswith('.pdf'))
                    if pdf_link and 'href' in pdf_link.attrs:
                        pdf_url = pdf_link['href']
                        # Relative links are resolved against the NDSS site root.
                        if not pdf_url.startswith('http'):
                            pdf_url = f"https://www.ndss-symposium.org{pdf_url}"
                        papers.append({
                            'title': title,
                            'url': pdf_url,
                            'conference': 'NDSS',
                            'year': year
                        })
            else:
                raise Exception(f"Failed to fetch NDSS {year} papers list")

            return papers

        except Exception as e:
            self.logger.error(f"解析 NDSS {year} 论文列表失败: {e}")
            raise

    async def get_usenix_papers(self, session: "AsyncSession", base_url: str, year: str) -> List[Dict[str, Any]]:
        """Fetch the USENIX Security technical-sessions page and return paper dicts."""
        papers = []
        # Accept both two-digit ('24') and four-digit ('2024') years.
        full_year = f"20{year}" if len(year) == 2 else year
        url = f"{base_url}{full_year}/technical-sessions"

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            # FIX: same aiohttp-idiom → curl_cffi-idiom correction as get_sp_papers.
            response = await session.get(url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                paper_nodes = soup.find_all(['article', 'div'], class_=['node-paper', 'paper-item'])

                for node in paper_nodes:
                    title_elem = node.find(['h2', 'div'], class_=['node-title', 'field-title'])
                    if not title_elem:
                        continue

                    title = title_elem.text.strip()
                    pdf_url = await self._get_usenix_pdf_url(node)
                    if pdf_url:
                        papers.append({
                            'title': title,
                            'url': pdf_url,
                            'conference': 'USENIX',
                            'year': year
                        })
            else:
                raise Exception(f"Failed to fetch USENIX {year} papers list")

            return papers

        except Exception as e:
            self.logger.error(f"解析 USENIX {year} 论文列表失败: {e}")
            raise

    async def _get_ieee_pdf_url(self, paper_element) -> Optional[str]:
        """Extract a PDF URL from an IEEE listing element (direct link or stamp URL)."""
        try:
            pdf_link = paper_element.find('a', href=re.compile(r'.*\.pdf'))
            if pdf_link:
                return pdf_link['href']

            # Fall back to building the stamp.jsp URL from the document id.
            article_link = paper_element.find('a', href=re.compile(r'/document/'))
            if article_link:
                doc_id = re.search(r'/document/(\d+)', article_link['href'])
                if doc_id:
                    return f"https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={doc_id.group(1)}"

            return None
        except Exception as e:
            self.logger.error(f"Error extracting IEEE PDF URL: {str(e)}")
            return None

    async def _get_usenix_pdf_url(self, node) -> Optional[str]:
        """Extract an absolute PDF URL from a USENIX paper node, or None."""
        try:
            pdf_link = node.find('a', href=re.compile(r'\.pdf$'))
            if pdf_link and 'href' in pdf_link.attrs:
                url = pdf_link['href']
                if not url.startswith('http'):
                    url = f"https://www.usenix.org{url}"
                return url

            return None
        except Exception as e:
            self.logger.error(f"Error extracting USENIX PDF URL: {str(e)}")
            return None
def setup_logger(name):
    """Create (once) and return a stream logger with a standard format."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger


class ConferenceParsers:
    """Parse conference listing pages (NDSS / USENIX / IEEE S&P) into paper dicts.

    All parsers take a curl_cffi ``AsyncSession`` whose ``get()`` is awaited
    directly and returns a response exposing ``.status_code`` / ``.text``.
    """

    def __init__(self):
        self.logger = setup_logger(__name__)

    async def parse_ndss_papers(self, base_url: str, year: str, session: "AsyncSession") -> List[Dict[str, Any]]:
        """Parse the NDSS accepted-papers page, with retries and progress output.

        Returns a list of dicts with title/authors/abstract/url/detail_url/doi;
        empty list when every retry fails.
        """
        papers = []
        full_year = f"20{year}" if len(year) == 2 else year
        url = f"{base_url}ndss{full_year}/accepted-papers/"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        print(f"🌐 访问 NDSS {year} 会议页面...")

        max_retries = 3
        for attempt in range(max_retries):
            try:
                # FIX: curl_cffi's AsyncSession.get() is awaited directly and returns
                # a plain response (`.status_code`, `.text`); the original used the
                # aiohttp idiom (`async with`, `.status`, `await .text()`), which
                # fails at runtime — parse_usenix_papers below already did it right.
                response = await session.get(url, headers=headers)
                print(f"⚡ 尝试 {attempt + 1}/{max_retries}: HTTP {response.status_code}")

                if response.status_code == 200:
                    print(f"📝 正在解析页面内容...")
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # Primary NDSS paper container selector, with fallbacks.
                    paper_containers = soup.find_all('div', class_='tag-box rel-paper')
                    print(f"📚 找到 {len(paper_containers)} 个论文容器")

                    if not paper_containers:
                        print(f"⚠️ 未找到论文容器,尝试其他选择器...")
                        paper_containers = soup.find_all('div', class_='paper') or soup.find_all('article')
                        print(f"🔄 备用选择器找到 {len(paper_containers)} 个容器")

                    for idx, container in enumerate(paper_containers):
                        # Lightweight progress display every 5 containers.
                        if idx % 5 == 0 or idx == len(paper_containers) - 1:
                            progress = (idx + 1) / len(paper_containers) * 100
                            print(f"� 解析进度: {progress:.1f}% ({idx+1}/{len(paper_containers)})")

                        try:
                            # Title: try several heading selectors.
                            title_elem = (container.find('h3', class_='blog-post-title') or
                                          container.find('h3') or
                                          container.find('h2') or
                                          container.find('h1'))
                            if not title_elem:
                                continue

                            title = title_elem.get_text().strip()
                            print(f"📄 [{idx+1}/{len(paper_containers)}] 找到论文: {title[:70]}{'...' if len(title) > 70 else ''}")

                            author_elem = container.find('p')
                            authors_text = author_elem.get_text().strip() if author_elem else ''
                            authors = [author.strip() for author in authors_text.split(',')] if authors_text else []

                            detail_link = (container.find('a', class_='paper-link-abs') or
                                           container.find('a', href=True))
                            detail_url = detail_link.get('href') if detail_link else ''

                            pdf_url = ''
                            if detail_url:
                                # Resolve the detail page to an absolute URL first.
                                if not detail_url.startswith('http'):
                                    if detail_url.startswith('//'):
                                        detail_url = f'https:{detail_url}'
                                    elif detail_url.startswith('/'):
                                        detail_url = f'https://www.ndss-symposium.org{detail_url}'
                                    else:
                                        detail_url = f'https://www.ndss-symposium.org/{detail_url}'
                                pdf_url = await self._get_ndss_pdf_from_detail_page(session, detail_url)

                            paper_info = {
                                'title': title,
                                'authors': authors,
                                'abstract': '',
                                'url': pdf_url,
                                'detail_url': detail_url,
                                'doi': ''
                            }

                            # Very short strings are navigation noise, not titles.
                            if title and len(title) > 10:
                                papers.append(paper_info)
                        except Exception as e:
                            self.logger.warning(f"Error parsing paper container {idx}: {str(e)}")
                            continue

                    print(f"\n✅ 成功解析 {len(papers)} 篇论文")
                    return papers

            except Exception as e:
                print(f"❌ 尝试 {attempt + 1} 失败: {str(e)}")
                if attempt < max_retries - 1:
                    delay = 3 * (attempt + 1)  # linear backoff: 3s, 6s
                    print(f"⏳ 等待 {delay} 秒后重试...")
                    await asyncio.sleep(delay)
                else:
                    return []

        print("❌ 所有重试均失败")
        return []

    async def _get_ndss_pdf_from_detail_page(self, session: "AsyncSession", detail_url: str) -> str:
        """Extract an absolute PDF URL from an NDSS paper detail page ('' if none)."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            # FIX: same aiohttp-idiom → curl_cffi-idiom correction as above.
            response = await session.get(detail_url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Try several PDF selectors in priority order.
                pdf_selectors = [
                    ('a', {'href': lambda x: x and x.endswith('.pdf')}),
                    ('a', {'href': lambda x: x and 'paper' in x.lower() and '.pdf' in x.lower()}),
                    ('a', {'class': 'file-pdf'}),
                    ('a', {'class': 'download-pdf'}),
                    ('a', {'title': lambda x: x and 'pdf' in x.lower()})
                ]

                for tag, attrs in pdf_selectors:
                    pdf_link = soup.find(tag, attrs)
                    if pdf_link and 'href' in pdf_link.attrs:
                        pdf_url = pdf_link['href']
                        # Resolve relative URLs against the NDSS site root.
                        if not pdf_url.startswith('http'):
                            if pdf_url.startswith('//'):
                                pdf_url = f'https:{pdf_url}'
                            elif pdf_url.startswith('/'):
                                pdf_url = f'https://www.ndss-symposium.org{pdf_url}'
                            else:
                                pdf_url = f'https://www.ndss-symposium.org/{pdf_url}'
                        return pdf_url

            return ''

        except Exception as e:
            self.logger.warning(f"Error getting PDF from detail page: {str(e)}")
            return ''

    async def parse_usenix_papers(self, base_url: str, year: str, session: "AsyncSession") -> List[Dict[str, Any]]:
        """Parse the USENIX Security technical-sessions page into paper dicts."""
        papers: List[Dict[str, Any]] = []
        url = f"https://www.usenix.org/conference/usenixsecurity{year}/technical-sessions"

        print(f"🌐 正在解析 USENIX Security {year} 论文列表...")

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            )
        }

        try:
            response = await session.get(url, headers=headers)
            status_code = getattr(response, "status_code", None)

            if status_code == 200:
                print("✅ 页面访问成功,开始解析...")
                html = getattr(response, "text", "")
                soup = BeautifulSoup(html, "html.parser")

                paper_nodes = soup.find_all(["article", "div"], class_=["node-paper", "paper-item"])
                print(f"📚 找到 {len(paper_nodes)} 个论文节点")

                if not paper_nodes:
                    print("⚠️ 未找到论文节点,尝试备用选择器...")
                    paper_nodes = soup.find_all(["div", "article"], class_=["paper", "technical-paper"])

                for idx, node in enumerate(paper_nodes, 1):
                    try:
                        title_elem = (
                            node.find(["h2", "h3"], class_=["node-title", "paper-title"])
                            or node.find("div", class_="field-title")
                        )
                        if not title_elem:
                            continue

                        title = title_elem.text.strip()
                        pdf_url = await self._get_usenix_pdf_url(node)
                        if not pdf_url:
                            continue

                        papers.append(
                            {
                                "title": title,
                                "url": pdf_url,
                                "conference": "USENIX",
                                "year": year,
                            }
                        )
                        print(
                            f"\r📄 处理论文 {idx}/{len(paper_nodes)}: {title[:50]}...",
                            end="",
                            flush=True,
                        )
                    except Exception:
                        continue

                print(f"\n✅ USENIX解析完成: {len(papers)} 篇论文")
                return papers

            if status_code == 404:
                print(f"❌ USENIX {year} 页面不存在")
                return []

            raise Exception(f"HTTP {status_code}")

        except Exception as e:
            print(f"❌ USENIX解析错误: {str(e)}")
            return []

    async def _get_usenix_pdf_url(self, node) -> Optional[str]:
        """Extract a PDF (or presentation-page) URL from a USENIX paper node."""
        try:
            pdf_link = node.find('a', href=re.compile(r'\.pdf$', re.I))
            if pdf_link and pdf_link.get('href'):
                pdf_url = pdf_link['href']
                return self._complete_usenix_url(pdf_url)

            # No direct PDF: fall back to the presentation page link.
            pres_link = node.find('a', href=re.compile(r'/presentation/', re.I))
            if pres_link and pres_link.get('href'):
                pres_url = pres_link['href']
                return self._complete_usenix_url(pres_url)

            return None

        except Exception as e:
            print(f"⚠️ PDF链接提取失败: {str(e)}")
            return None

    def _complete_usenix_url(self, url: str) -> str:
        """Resolve a possibly-relative USENIX link to an absolute https URL."""
        if not url:
            return ''

        if url.startswith('http'):
            return url
        elif url.startswith('//'):
            return f"https:{url}"
        elif url.startswith('/'):
            return f"https://www.usenix.org{url}"
        else:
            return f"https://www.usenix.org/{url}"

    async def parse_sp_papers(self, base_url: str, year: str, session: "AsyncSession") -> List[Dict[str, Any]]:
        """Parse the IEEE S&P listing page into paper dicts ([] on failure)."""
        papers = []
        full_year = f"20{year}" if len(year) == 2 else year

        print(f"🌐 正在获取 IEEE S&P {full_year} 论文列表...")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1'
        }

        # FIX: the original body was structurally broken — a duplicated
        # `except ... continue` sat at function level outside any loop, and an
        # `else:` followed `return papers` (a syntax error). Restored the intended
        # shape: 200 → parse and return; non-200 → raise; outer except → log, [].
        try:
            response = await session.get(base_url, headers=headers)
            if response.status_code == 200:
                print(f"📝 正在解析页面内容...")
                soup = BeautifulSoup(response.text, 'html.parser')

                paper_items = soup.find_all(['div', 'article'], class_=['paper-item', 'article-item'])
                print(f"📚 找到 {len(paper_items)} 个论文项")

                if not paper_items:
                    paper_items = soup.find_all(['div', 'article'], class_=['paper', 'article'])

                for idx, item in enumerate(paper_items, 1):
                    try:
                        title_elem = item.find(['h3', 'h2'], class_=['paper-title', 'article-title'])
                        if not title_elem:
                            continue

                        title = title_elem.text.strip()
                        pdf_url = await self._get_ieee_pdf_url(item)
                        if pdf_url:
                            papers.append({
                                'title': title,
                                'url': pdf_url,
                                'conference': 'SP',
                                'year': year
                            })

                        print(f"\r📄 处理论文 {idx}/{len(paper_items)}: {title[:50]}...", end='', flush=True)
                    except Exception as e:
                        print(f"\n⚠️ 处理论文时出错: {str(e)}")
                        continue

                print(f"\n✅ SP解析完成: {len(papers)} 篇论文")
                return papers
            else:
                raise Exception(f"HTTP {response.status_code}")

        except Exception as e:
            print(f"❌ SP解析错误: {str(e)}")
            return []

    async def _get_ieee_pdf_url(self, paper_element) -> Optional[str]:
        """Extract a PDF URL from an IEEE listing element (direct link or stamp URL)."""
        try:
            pdf_link = paper_element.find('a', href=re.compile(r'\.pdf$', re.I))
            if pdf_link and pdf_link.get('href'):
                return pdf_link['href']

            # Fall back to building the stamp.jsp URL from the document id.
            article_link = paper_element.find('a', href=re.compile(r'/document/'))
            if article_link and article_link.get('href'):
                doc_match = re.search(r'/document/(\d+)', article_link['href'])
                if doc_match:
                    doc_id = doc_match.group(1)
                    return f"https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={doc_id}"

            return None

        except Exception as e:
            print(f"⚠️ PDF链接提取失败: {str(e)}")
            return None
# NOTE(review): conference_parsers_new.py is a byte-for-byte duplicate of
# conference_parsers.py (same index hash eacab86f in the diff header). It carries
# the same two defects fixed below; one of the two files should be deleted.

def setup_logger(name):
    """Create (once) and return a stream logger with a standard format."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        logger.setLevel(logging.INFO)
        handler = logging.StreamHandler()
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger


class ConferenceParsers:
    """Parse conference listing pages (NDSS / USENIX / IEEE S&P) into paper dicts.

    All parsers take a curl_cffi ``AsyncSession`` whose ``get()`` is awaited
    directly and returns a response exposing ``.status_code`` / ``.text``.
    """

    def __init__(self):
        self.logger = setup_logger(__name__)

    async def parse_ndss_papers(self, base_url: str, year: str, session: "AsyncSession") -> List[Dict[str, Any]]:
        """Parse the NDSS accepted-papers page, with retries and progress output."""
        papers = []
        full_year = f"20{year}" if len(year) == 2 else year
        url = f"{base_url}ndss{full_year}/accepted-papers/"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        print(f"🌐 访问 NDSS {year} 会议页面...")

        max_retries = 3
        for attempt in range(max_retries):
            try:
                # FIX: awaited curl_cffi call (`.status_code`, `.text`) instead of
                # the aiohttp idiom (`async with`, `.status`, `await .text()`),
                # which fails with curl_cffi's AsyncSession.
                response = await session.get(url, headers=headers)
                print(f"⚡ 尝试 {attempt + 1}/{max_retries}: HTTP {response.status_code}")

                if response.status_code == 200:
                    print(f"📝 正在解析页面内容...")
                    soup = BeautifulSoup(response.text, 'html.parser')

                    paper_containers = soup.find_all('div', class_='tag-box rel-paper')
                    print(f"📚 找到 {len(paper_containers)} 个论文容器")

                    if not paper_containers:
                        print(f"⚠️ 未找到论文容器,尝试其他选择器...")
                        paper_containers = soup.find_all('div', class_='paper') or soup.find_all('article')
                        print(f"🔄 备用选择器找到 {len(paper_containers)} 个容器")

                    for idx, container in enumerate(paper_containers):
                        if idx % 5 == 0 or idx == len(paper_containers) - 1:
                            progress = (idx + 1) / len(paper_containers) * 100
                            print(f"� 解析进度: {progress:.1f}% ({idx+1}/{len(paper_containers)})")

                        try:
                            title_elem = (container.find('h3', class_='blog-post-title') or
                                          container.find('h3') or
                                          container.find('h2') or
                                          container.find('h1'))
                            if not title_elem:
                                continue

                            title = title_elem.get_text().strip()
                            print(f"📄 [{idx+1}/{len(paper_containers)}] 找到论文: {title[:70]}{'...' if len(title) > 70 else ''}")

                            author_elem = container.find('p')
                            authors_text = author_elem.get_text().strip() if author_elem else ''
                            authors = [author.strip() for author in authors_text.split(',')] if authors_text else []

                            detail_link = (container.find('a', class_='paper-link-abs') or
                                           container.find('a', href=True))
                            detail_url = detail_link.get('href') if detail_link else ''

                            pdf_url = ''
                            if detail_url:
                                if not detail_url.startswith('http'):
                                    if detail_url.startswith('//'):
                                        detail_url = f'https:{detail_url}'
                                    elif detail_url.startswith('/'):
                                        detail_url = f'https://www.ndss-symposium.org{detail_url}'
                                    else:
                                        detail_url = f'https://www.ndss-symposium.org/{detail_url}'
                                pdf_url = await self._get_ndss_pdf_from_detail_page(session, detail_url)

                            paper_info = {
                                'title': title,
                                'authors': authors,
                                'abstract': '',
                                'url': pdf_url,
                                'detail_url': detail_url,
                                'doi': ''
                            }

                            if title and len(title) > 10:
                                papers.append(paper_info)
                        except Exception as e:
                            self.logger.warning(f"Error parsing paper container {idx}: {str(e)}")
                            continue

                    print(f"\n✅ 成功解析 {len(papers)} 篇论文")
                    return papers

            except Exception as e:
                print(f"❌ 尝试 {attempt + 1} 失败: {str(e)}")
                if attempt < max_retries - 1:
                    delay = 3 * (attempt + 1)
                    print(f"⏳ 等待 {delay} 秒后重试...")
                    await asyncio.sleep(delay)
                else:
                    return []

        print("❌ 所有重试均失败")
        return []

    async def _get_ndss_pdf_from_detail_page(self, session: "AsyncSession", detail_url: str) -> str:
        """Extract an absolute PDF URL from an NDSS paper detail page ('' if none)."""
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }

            # FIX: same aiohttp-idiom → curl_cffi-idiom correction as above.
            response = await session.get(detail_url, headers=headers)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                pdf_selectors = [
                    ('a', {'href': lambda x: x and x.endswith('.pdf')}),
                    ('a', {'href': lambda x: x and 'paper' in x.lower() and '.pdf' in x.lower()}),
                    ('a', {'class': 'file-pdf'}),
                    ('a', {'class': 'download-pdf'}),
                    ('a', {'title': lambda x: x and 'pdf' in x.lower()})
                ]

                for tag, attrs in pdf_selectors:
                    pdf_link = soup.find(tag, attrs)
                    if pdf_link and 'href' in pdf_link.attrs:
                        pdf_url = pdf_link['href']
                        if not pdf_url.startswith('http'):
                            if pdf_url.startswith('//'):
                                pdf_url = f'https:{pdf_url}'
                            elif pdf_url.startswith('/'):
                                pdf_url = f'https://www.ndss-symposium.org{pdf_url}'
                            else:
                                pdf_url = f'https://www.ndss-symposium.org/{pdf_url}'
                        return pdf_url

            return ''

        except Exception as e:
            self.logger.warning(f"Error getting PDF from detail page: {str(e)}")
            return ''

    async def parse_usenix_papers(self, base_url: str, year: str, session: "AsyncSession") -> List[Dict[str, Any]]:
        """Parse the USENIX Security technical-sessions page into paper dicts."""
        papers: List[Dict[str, Any]] = []
        url = f"https://www.usenix.org/conference/usenixsecurity{year}/technical-sessions"

        print(f"🌐 正在解析 USENIX Security {year} 论文列表...")

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            )
        }

        try:
            response = await session.get(url, headers=headers)
            status_code = getattr(response, "status_code", None)

            if status_code == 200:
                print("✅ 页面访问成功,开始解析...")
                html = getattr(response, "text", "")
                soup = BeautifulSoup(html, "html.parser")

                paper_nodes = soup.find_all(["article", "div"], class_=["node-paper", "paper-item"])
                print(f"📚 找到 {len(paper_nodes)} 个论文节点")

                if not paper_nodes:
                    print("⚠️ 未找到论文节点,尝试备用选择器...")
                    paper_nodes = soup.find_all(["div", "article"], class_=["paper", "technical-paper"])

                for idx, node in enumerate(paper_nodes, 1):
                    try:
                        title_elem = (
                            node.find(["h2", "h3"], class_=["node-title", "paper-title"])
                            or node.find("div", class_="field-title")
                        )
                        if not title_elem:
                            continue

                        title = title_elem.text.strip()
                        pdf_url = await self._get_usenix_pdf_url(node)
                        if not pdf_url:
                            continue

                        papers.append(
                            {
                                "title": title,
                                "url": pdf_url,
                                "conference": "USENIX",
                                "year": year,
                            }
                        )
                        print(
                            f"\r📄 处理论文 {idx}/{len(paper_nodes)}: {title[:50]}...",
                            end="",
                            flush=True,
                        )
                    except Exception:
                        continue

                print(f"\n✅ USENIX解析完成: {len(papers)} 篇论文")
                return papers

            if status_code == 404:
                print(f"❌ USENIX {year} 页面不存在")
                return []

            raise Exception(f"HTTP {status_code}")

        except Exception as e:
            print(f"❌ USENIX解析错误: {str(e)}")
            return []

    async def _get_usenix_pdf_url(self, node) -> Optional[str]:
        """Extract a PDF (or presentation-page) URL from a USENIX paper node."""
        try:
            pdf_link = node.find('a', href=re.compile(r'\.pdf$', re.I))
            if pdf_link and pdf_link.get('href'):
                pdf_url = pdf_link['href']
                return self._complete_usenix_url(pdf_url)

            pres_link = node.find('a', href=re.compile(r'/presentation/', re.I))
            if pres_link and pres_link.get('href'):
                pres_url = pres_link['href']
                return self._complete_usenix_url(pres_url)

            return None

        except Exception as e:
            print(f"⚠️ PDF链接提取失败: {str(e)}")
            return None

    def _complete_usenix_url(self, url: str) -> str:
        """Resolve a possibly-relative USENIX link to an absolute https URL."""
        if not url:
            return ''

        if url.startswith('http'):
            return url
        elif url.startswith('//'):
            return f"https:{url}"
        elif url.startswith('/'):
            return f"https://www.usenix.org{url}"
        else:
            return f"https://www.usenix.org/{url}"

    async def parse_sp_papers(self, base_url: str, year: str, session: "AsyncSession") -> List[Dict[str, Any]]:
        """Parse the IEEE S&P listing page into paper dicts ([] on failure)."""
        papers = []
        full_year = f"20{year}" if len(year) == 2 else year

        print(f"🌐 正在获取 IEEE S&P {full_year} 论文列表...")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1'
        }

        # FIX: same structural repair as conference_parsers.py — the original had a
        # duplicated `except ... continue` outside any loop and an `else:` after
        # `return papers`, which is a syntax error.
        try:
            response = await session.get(base_url, headers=headers)
            if response.status_code == 200:
                print(f"📝 正在解析页面内容...")
                soup = BeautifulSoup(response.text, 'html.parser')

                paper_items = soup.find_all(['div', 'article'], class_=['paper-item', 'article-item'])
                print(f"📚 找到 {len(paper_items)} 个论文项")

                if not paper_items:
                    paper_items = soup.find_all(['div', 'article'], class_=['paper', 'article'])

                for idx, item in enumerate(paper_items, 1):
                    try:
                        title_elem = item.find(['h3', 'h2'], class_=['paper-title', 'article-title'])
                        if not title_elem:
                            continue

                        title = title_elem.text.strip()
                        pdf_url = await self._get_ieee_pdf_url(item)
                        if pdf_url:
                            papers.append({
                                'title': title,
                                'url': pdf_url,
                                'conference': 'SP',
                                'year': year
                            })

                        print(f"\r📄 处理论文 {idx}/{len(paper_items)}: {title[:50]}...", end='', flush=True)
                    except Exception as e:
                        print(f"\n⚠️ 处理论文时出错: {str(e)}")
                        continue

                print(f"\n✅ SP解析完成: {len(papers)} 篇论文")
                return papers
            else:
                raise Exception(f"HTTP {response.status_code}")

        except Exception as e:
            print(f"❌ SP解析错误: {str(e)}")
            return []

    async def _get_ieee_pdf_url(self, paper_element) -> Optional[str]:
        """Extract a PDF URL from an IEEE listing element (direct link or stamp URL)."""
        try:
            pdf_link = paper_element.find('a', href=re.compile(r'\.pdf$', re.I))
            if pdf_link and pdf_link.get('href'):
                return pdf_link['href']

            article_link = paper_element.find('a', href=re.compile(r'/document/'))
            if article_link and article_link.get('href'):
                doc_match = re.search(r'/document/(\d+)', article_link['href'])
                if doc_match:
                    doc_id = doc_match.group(1)
                    return f"https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={doc_id}"

            return None

        except Exception as e:
            print(f"⚠️ PDF链接提取失败: {str(e)}")
            return None
- 'headers': { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0' - } - }, - 'ndss': { - 'base_url': 'https://www.ndss-symposium.org', - 'headers': { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0' - } - }, - 'usenix': { - 'base_url': 'https://www.usenix.org/conference/usenixsecurity', - 'headers': { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0' - } - } - } - - async def _download_with_retry(self, url: str) -> Optional[bytes]: - """ - 智能下载实现,带自动重试和反爬处理。 - - Args: - url (str): 要下载的URL - - Returns: - Optional[bytes]: 下载的内容,失败返回None - """ - # 验证并确保会话可用 - if not self.session: - try: - self.logger.info("正在重新创建持久化会话...") - self.session = AsyncSession() - except Exception as e: - self.logger.error(f"创建持久化会话失败: {e}") - return None - - last_error = None - content = None - - for attempt in range(1, self.max_retries + 1): - try: - self.logger.info(f"下载尝试 
{attempt}/{self.max_retries}: {url}") - - # 配置特殊headers以绕过反爬 - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - - # 验证会话状态 - if not self.session or getattr(self.session, '_closed', False): - self.logger.warning("会话已关闭,正在重新创建...") - self.session = AsyncSession() - - # 使用curl_cffi的持久化会话和浏览器仿真 - response = await self.session.get( - url, - impersonate="chrome110", - headers=headers, - timeout=60 - ) - - # 检查HTTP状态码 - if response.status_code == 403: - self.logger.warning(f"遇到403 Forbidden,可能是反爬限制 (尝试 {attempt}/{self.max_retries})") - await asyncio.sleep(self.retry_delay * attempt) # 指数退避 - continue - - elif response.status_code == 429: - self.logger.warning(f"遇到429 Too Many Requests,开始等待 (尝试 {attempt}/{self.max_retries})") - await asyncio.sleep(self.retry_delay * 2 * attempt) # 更长的等待 - continue - - elif response.status_code != 200: - self.logger.warning(f"HTTP {response.status_code} (尝试 {attempt}/{self.max_retries})") - await asyncio.sleep(self.retry_delay) - continue - - # 获取响应内容 - content = response.content - - # 验证内容 - if not content or len(content) < 1024: # 小于1KB可能是错误页面 - self.logger.warning(f"响应内容过小: {len(content) if content else 0} bytes") - continue - - # 对于PDF,验证文件头 - if url.lower().endswith('.pdf') and not content.startswith(b'%PDF'): - self.logger.warning("响应不是有效的PDF格式") - continue - - self.logger.info(f"✅ 成功下载: {len(content)} bytes") - return content - - except Exception as e: - last_error = e - self.logger.warning(f"下载出错 (尝试 {attempt}/{self.max_retries}): {e}") - await asyncio.sleep(self.retry_delay) - continue - - # 所有重试都失败 - if last_error: - self.logger.error(f"❌ 下载失败,已达到最大重试次数。最后错误: {last_error}") - else: - self.logger.error("❌ 下载失败,未获得有效内容") - return None - - 
def _sanitize_filename(self, filename: str) -> str: - """清理并规范化文件名,移除非法字符""" - # 替换 Windows 文件系统不允许的字符 - invalid_chars = r'[\\/:"*?<>|]+' - filename = re.sub(invalid_chars, '_', filename) - - # 将连续的空白字符替换为单个空格 - filename = re.sub(r'\s+', ' ', filename) - - # 去除首尾空白 - filename = filename.strip() - - # 如果文件名为空,使用默认名称 - if not filename: - filename = f"paper_{int(time.time())}" - - # 限制文件名长度(Windows 最大路径长度为 260 字符) - max_length = 200 # 留一些余地给路径和扩展名 - if len(filename) > max_length: - filename = filename[:max_length-3] + "..." - - return filename - - def __init__(self, config: Optional[Dict[str, Any]] = None): - self.config = config or {} - self.logger = setup_logger(__name__) - self.download_path = Path(self.config.get('download_path', './papers')) - self.download_path.mkdir(parents=True, exist_ok=True) - - self.session: Optional[AsyncSession] = None - - # 配置下载重试参数 - self.max_retries = self.config.get('max_retries', 3) - self.retry_delay = self.config.get('retry_delay', 3) - - # 并发控制 - max_concurrent = 1 - self.semaphore = asyncio.Semaphore(max_concurrent) - - - # 会议URL模板 - self.conference_urls = { - 'ccs': 'https://dl.acm.org/doi/proceedings/', - 'sp': 'https://ieeexplore.ieee.org/xpl/conhome/', - 'ndss': 'https://www.ndss-symposium.org/', - 'usenix': 'https://www.usenix.org/conference/' - } - - async def __aenter__(self): - """创建并返回一个持久化的 curl_cffi 会话.""" - try: - if self.session and not getattr(self.session, '_closed', False): - self.logger.info("使用现有的持久化会话...") - return self - - self.logger.info("正在创建新的持久化会话...") - self.session = AsyncSession() - return self - except Exception as e: - self.logger.error(f"创建持久化会话失败: {e}") - raise - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """关闭持久化会话.""" - try: - if self.session: - # 检查会话是否已经关闭 - is_closed = getattr(self.session, '_closed', True) - - if not is_closed and hasattr(self.session, 'close'): - try: - self.logger.info("正在关闭持久化会话...") - await self.session.close() - except Exception as e: - 
self.logger.warning(f"关闭会话时出现异常: {e}") - else: - self.logger.info("会话已经关闭,无需再次关闭") - except Exception as e: - self.logger.warning(f"处理会话关闭时出现异常: {e}") - finally: - # 确保会话对象被清理 - self.session = None - - async def download_paper(self, url: str, title: str, paper_index: int = 0, total_papers: int = 0) -> Dict[str, Any]: - """下载单篇论文 - 优化版本""" - async with self.semaphore: - try: - # 生成文件名 - 为IEEE论文添加特殊前缀 - safe_title = self._sanitize_filename(title) - - # 使用简化的文件名:只使用论文标题 - filename = f"{safe_title}.pdf" - - file_path = self.download_path / filename - - # 显示下载进度(与NDSS/USENIX保持一致) - if total_papers > 0: - progress = (paper_index + 1) / total_papers * 100 - print(f"💾 [{paper_index+1}/{total_papers}] 下载: {title[:50]}{'...' if len(title) > 50 else ''}") - - # 检查是否已下载并验证文件 - if file_path.exists(): - # 验证文件大小,过小的文件可能是错误页面 - file_size = file_path.stat().st_size - if file_size > 1024: # 大于1KB认为有效 - return { - 'success': True, - 'path': str(file_path), - 'cached': True, - 'size': file_size - } - else: - # 删除无效文件 - file_path.unlink() - self.logger.warning(f"Removed invalid cached file: {file_path}") - - # 下载论文 - content = await self._download_with_retry(url) - if content: - # 验证下载内容 - if len(content) < 1024: - raise Exception(f"Downloaded content too small ({len(content)} bytes), likely an error page") - - # 保存文件 - file_path.write_bytes(content) - file_size = len(content) - - return { - 'success': True, - 'path': str(file_path), - 'cached': False, - 'size': file_size - } - else: - raise Exception("Failed to download paper - no content received") - - except Exception as e: - self.logger.error(f"Error downloading paper {title}: {str(e)}") - return { - 'success': False, - 'error': str(e) - } - - async def _parse_sp_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """解析IEEE S&P论文列表""" - papers = [] - full_year = f"20{year}" if len(year) == 2 else year - - try: - print(f"🌐 正在获取 IEEE S&P {full_year} 论文列表...") - conf_info = self.CONFERENCE_INFO['sp'] - api_url = 
f"{conf_info['base_url']}" - - # 使用会话发送请求 - if not self.session: - self.session = AsyncSession() - - response = await self.session.get( - base_url, - headers=conf_info['headers'], - impersonate="chrome110" - ) - - if response.status_code != 200: - raise Exception(f"获取会议页面失败: HTTP {response.status_code}") - - # 解析页面内容 - soup = BeautifulSoup(response.text, 'html.parser') - paper_items = soup.select('div.paper-item') - - for item in paper_items: - title_elem = item.select_one('h3.paper-title') - if not title_elem: - continue - - title = title_elem.text.strip() - url = item.select_one('a[href*=".pdf"]') - if not url: - continue - - pdf_url = url['href'] - if not pdf_url.startswith('http'): - pdf_url = f"https://ieeexplore.ieee.org{pdf_url}" - - papers.append({ - 'title': title, - 'url': pdf_url - }) - - print(f"✅ 找到 {len(papers)} 篇论文") - return papers - - except Exception as e: - print(f"❌ 获取 IEEE S&P {full_year} 论文列表失败: {str(e)}") - return [] - - async def _parse_ndss_papers(self, year: str) -> List[Dict[str, Any]]: - """解析 NDSS 论文列表""" - papers = [] - full_year = f"20{year}" if len(year) == 2 else year - - try: - print(f"📚 正在获取 NDSS {full_year} 论文列表...") - conf_info = self.CONFERENCE_INFO['ndss'] - base_url = f"{conf_info['base_url']}/ndss{year}/accepted-papers" - - # 使用会话发送请求 - if not self.session: - self.session = AsyncSession() - - response = await self.session.get( - base_url, - headers=conf_info['headers'], - impersonate="chrome110" - ) - - if response.status_code != 200: - raise Exception(f"获取会议页面失败: HTTP {response.status_code}") - - # 解析页面内容 - soup = BeautifulSoup(response.text, 'html.parser') - paper_items = soup.select('div.paper-item, div.accepted-paper') - - for item in paper_items: - title_elem = item.select_one('h3.paper-title, h4.paper-title, div.paper-title') - if not title_elem: - continue - - title = title_elem.text.strip() - url = item.select_one('a[href*=".pdf"]') - if not url: - continue - - pdf_url = url['href'] - if not 
pdf_url.startswith('http'): - pdf_url = f"{conf_info['base_url']}{pdf_url}" - - papers.append({ - 'title': title, - 'url': pdf_url - }) - - print(f"✅ 找到 {len(papers)} 篇论文") - return papers - - except Exception as e: - print(f"❌ 获取 NDSS {full_year} 论文列表失败: {str(e)}") - return [] - - async def _parse_usenix_papers(self, year: str) -> List[Dict[str, Any]]: - """解析 USENIX Security 论文列表""" - papers = [] - full_year = f"20{year}" if len(year) == 2 else year - - try: - print(f"📚 正在获取 USENIX Security {full_year} 论文列表...") - conf_info = self.CONFERENCE_INFO['usenix'] - base_url = f"{conf_info['base_url']}{full_year}/technical-sessions" - - # 使用会话发送请求 - if not self.session: - self.session = AsyncSession() - - response = await self.session.get( - base_url, - headers=conf_info['headers'], - impersonate="chrome110" - ) - - if response.status_code != 200: - raise Exception(f"获取会议页面失败: HTTP {response.status_code}") - - # 解析页面内容 - soup = BeautifulSoup(response.text, 'html.parser') - paper_items = soup.select('div.paper-item, div.node-paper') - - for item in paper_items: - title_elem = item.select_one('h2.node-title, div.field-title') - if not title_elem: - continue - - title = title_elem.text.strip() - url = item.select_one('a[href*=".pdf"]') - if not url: - continue - - pdf_url = url['href'] - if not pdf_url.startswith('http'): - pdf_url = f"https://www.usenix.org{pdf_url}" - - papers.append({ - 'title': title, - 'url': pdf_url - }) - - print(f"✅ 找到 {len(papers)} 篇论文") - return papers - - except Exception as e: - print(f"❌ 获取 USENIX Security {full_year} 论文列表失败: {str(e)}") - return [] - - async def get_conference_papers(self, conference: str, year: str) -> List[Dict[str, Any]]: - """获取会议论文列表 - 带进度显示""" - try: - if not conference in self.conference_urls: - raise ValueError(f"不支持的会议: {conference}") - - conf_info = self.CONFERENCE_INFO.get(conference) - base_url = self.conference_urls.get(conference) - - papers = [] - print(f"🔍 正在获取 {conference.upper()} {year} 论文列表...") - - if 
conference == 'ccs': - # 使用现有的CCS下载逻辑 - papers = await self._parse_ccs_papers(base_url, year) - else: - # 使用新的会议下载器处理其他会议 - if not self.session: - self.session = AsyncSession() - - conf_downloader = ConferenceDownloader(self.session, self.logger) - papers = await conf_downloader.parse_papers(conference, base_url, year) - - if papers: - print(f"✅ 成功获取 {len(papers)} 篇论文信息") - # 显示找到的论文标题 - print(f"📋 找到的论文列表:") - for i, paper in enumerate(papers[:10]): - title = paper.get('title', '未知标题')[:60] - print(f" {i+1:2d}. {title}{'...' if len(paper.get('title', '')) > 60 else ''}") - - if len(papers) > 10: - print(f" ... 和其他 {len(papers) - 10} 篇论文") - else: - print(f"⚠️ 未找到任何论文") - - return papers - - except Exception as e: - self.logger.error(f"获取论文列表失败: {str(e)}") - raise - try: - conf_info = self.CONFERENCE_INFO.get(conference) - if not conf_info and conference != 'ccs': - raise ValueError(f"不支持的会议: {conference}") - - papers = [] - print(f"🔍 正在获取 {conference.upper()} {year} 论文列表...") - - # 根据会议类型选择相应的解析方法 - if conference == 'ccs': - papers = await self._parse_ccs_papers(self.conference_urls[conference], year) - elif conference == 'sp': - papers = await self._parse_sp_papers(year) - elif conference == 'ndss': - papers = await self._parse_ndss_papers(year) - elif conference == 'usenix': - papers = await self._parse_usenix_papers(year) - - if papers: - print(f"✅ 成功获取 {len(papers)} 篇论文信息") - - # 显示找到的论文标题 - print(f"📋 找到的论文列表:") - for i, paper in enumerate(papers[:10]): - title = paper.get('title', '未知标题')[:60] - print(f" {i+1:2d}. {title}{'...' if len(paper.get('title', '')) > 60 else ''}") - - if len(papers) > 10: - print(f" ... 
和其他 {len(papers) - 10} 篇论文") - - # 开始PDF链接验证与进度显示 - print(f"\n🔗 正在验证PDF链接有效性...") - valid_count = 0 - - for i, paper in enumerate(papers): - # 显示进度 - progress = (i + 1) / len(papers) * 100 - progress_bar = '█' * int(progress // 5) + '░' * (20 - int(progress // 5)) - print(f"\r📋 [进度: {progress_bar}] {progress:.1f}% ({i+1}/{len(papers)}) 验证: {paper.get('title', '未知标题')[:30]}...", end='', flush=True) - - # 检查URL有效性 - if isinstance(paper.get('url'), str) and paper['url'].strip(): - valid_count += 1 - - print(f"\n✅ PDF链接验证完成: {valid_count}/{len(papers)} 个有效链接") - else: - print(f"⚠️ 未找到任何论文") - - return papers - - except Exception as e: - self.logger.error(f"获取论文列表失败: {str(e)}") - raise - """获取会议论文列表 - 带进度显示""" - try: - if conference not in self.conference_urls: - raise ValueError(f"不支持的会议: {conference}") - - base_url = self.conference_urls[conference] - papers = [] - - print(f"🔍 正在获取 {conference.upper()} {year} 论文列表...") - - # 规范化年份格式 - year = self.year_formats[conference](year) - - # 根据会议类型选择相应的解析方法 - papers = await self._get_papers_by_conference(conference, base_url, year) - if papers: - print(f"✨ 成功获取 {len(papers)} 篇论文信息") - return papers - - except Exception as e: - self.logger.error(f"获取论文列表失败: {e}") - raise - - async def _get_papers_by_conference(self, conference: str, base_url: str, year: str) -> List[Dict[str, Any]]: - """根据会议类型获取论文列表""" - try: - if conference == 'sp': - # IEEE S&P - full_url = f"{base_url}{year}" - return await self._get_sp_papers(full_url, year) - elif conference == 'ndss': - # NDSS - return await self._get_ndss_papers(base_url, year) - elif conference == 'usenix': - # USENIX Security - return await self._get_usenix_papers(base_url, year) - elif conference == 'ccs': - # ACM CCS - return await self._get_ccs_papers(base_url, year) - else: - raise ValueError(f"不支持的会议: {conference}") - except Exception as e: - self.logger.error(f"获取{conference.upper()} {year}论文列表失败: {e}") - raise - - async def _get_sp_papers(self, base_url: str, year: str) -> 
List[Dict[str, Any]]: - """获取 IEEE S&P 论文列表""" - papers = [] - try: - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/94.0.4606.81' - } - async with self.session.get(base_url, headers=headers) as response: - if response.status_code != 200: - raise Exception(f"Failed to fetch SP {year} papers list") - - soup = BeautifulSoup(response.text, 'lxml') - paper_items = soup.find_all('div', class_='article-list__item') - - for item in paper_items: - title_elem = item.find('h3', class_='article-list__title') - if not title_elem: - continue - - title = title_elem.text.strip() - pdf_link = item.find('a', class_='pdf-link') - - if pdf_link and 'href' in pdf_link.attrs: - url = pdf_link['href'] - if not url.startswith('http'): - url = f"https://www.computer.org{url}" - papers.append({ - 'title': title, - 'url': url - }) - - except Exception as e: - self.logger.error(f"解析 SP {year} 论文列表失败: {e}") - raise - - return papers - - async def _get_ndss_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """获取 NDSS 论文列表""" - papers = [] - try: - url = f"{base_url}ndss{year}/accepted-papers" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/94.0.4606.81' - } - async with self.session.get(url, headers=headers) as response: - if response.status_code != 200: - raise Exception(f"Failed to fetch NDSS {year} papers list") - - soup = BeautifulSoup(response.text, 'lxml') - paper_items = soup.find_all('div', class_='paper-item') - - for item in paper_items: - title_elem = item.find('h2', class_='title') - if not title_elem: - continue - - title = title_elem.text.strip() - pdf_link = item.find('a', href=lambda x: x and x.endswith('.pdf')) - - if pdf_link and 'href' in pdf_link.attrs: - url = pdf_link['href'] - if not url.startswith('http'): - url = f"https://www.ndss-symposium.org{url}" - papers.append({ - 'title': title, - 'url': url - }) - - except Exception as e: - self.logger.error(f"解析 NDSS {year} 论文列表失败: {e}") - raise - 
- return papers - - async def _get_usenix_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """获取 USENIX Security 论文列表""" - papers = [] - try: - url = f"{base_url}{year}/technical-sessions" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/94.0.4606.81' - } - async with self.session.get(url, headers=headers) as response: - if response.status_code != 200: - raise Exception(f"Failed to fetch USENIX {year} papers list") - - soup = BeautifulSoup(response.text, 'lxml') - paper_items = soup.find_all('div', class_='node-paper') - - for item in paper_items: - title_elem = item.find('h2', class_='node-title') - if not title_elem: - continue - - title = title_elem.text.strip() - pdf_link = item.find('a', href=lambda x: x and x.endswith('.pdf')) - - if pdf_link and 'href' in pdf_link.attrs: - url = pdf_link['href'] - if not url.startswith('http'): - url = f"https://www.usenix.org{url}" - papers.append({ - 'title': title, - 'url': url - }) - - except Exception as e: - self.logger.error(f"解析 USENIX {year} 论文列表失败: {e}") - raise - - return papers - - async def get_papers(self, conference: str, year: str) -> List[Dict[str, Any]]: - """ - 获取指定会议和年份的论文列表 - """ - try: - base_url = self.conference_urls.get(conference, {}).get(year) - if not base_url: - self.logger.error(f"未找到 {conference} {year} 的URL配置") - return [] - - if conference == 'ccs': - papers = await self._parse_ccs_papers(base_url, year) - elif conference == 'sp': - papers = await self._parse_sp_papers(base_url, year) - elif conference == 'ndss': - papers = await self._parse_ndss_papers(base_url, year) - elif conference == 'usenix': - papers = await self._parse_usenix_papers(base_url, year) - else: - self.logger.error(f"不支持的会议类型: {conference}") - return [] - - if papers: - print(f"✅ 成功获取 {len(papers)} 篇论文信息") - - # 显示找到的论文标题 - print(f"📋 找到的论文列表:") - for i, paper in enumerate(papers[:10]): - title = paper.get('title', '未知标题')[:60] - print(f" {i+1:2d}. {title}{'...' 
if len(paper.get('title', '')) > 60 else ''}") - - if len(papers) > 10: - print(f" ... 和其他 {len(papers) - 10} 篇论文") - - # 开始PDF链接验证与进度显示 - print(f"\n🔗 正在验证PDF链接有效性...") - valid_count = 0 - - for i, paper in enumerate(papers): - # 显示进度 - progress = (i + 1) / len(papers) * 100 - progress_bar = '█' * int(progress // 5) + '░' * (20 - int(progress // 5)) - print(f"\r📋 [进度: {progress_bar}] {progress:.1f}% ({i+1}/{len(papers)}) 验证: {paper.get('title', '未知标题')[:30]}...", end='', flush=True) - - # 检查URL有效性 - if isinstance(paper.get('url'), str) and paper['url'].strip(): - valid_count += 1 - - print(f"\n✅ PDF链接验证完成: {valid_count}/{len(papers)} 个有效链接") - else: - print(f"⚠️ 未找到任何论文") - - return papers - - except Exception as e: - self.logger.error(f"Error getting papers for {conference} {year}: {str(e)}") - raise - - - - async def _parse_ccs_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """ - 解析CCS论文列表的主入口函数。 - 使用持久化会话来执行所有相关请求。 - - Args: - base_url: 论文列表的基础URL - year: 会议年份 - - Returns: - 论文信息列表 - """ - papers = [] - try: - if not self.session: - raise RuntimeError("持久化会话未初始化。请在 'async with' 块中使用 PaperDownloader。") - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - - paper_dois = await self._get_all_ccs_dois_from_proceedings_page(self.session, year) - if not paper_dois: - self.logger.error(f"❌ 未能为CCS {year} 获取任何论文的DOI。") - return [] - - self.logger.info(f"📚 开始通过API批量解析 {len(paper_dois)} 篇CCS论文的详细信息...") - - papers = await self._fetch_all_ccs_paper_details_via_api(self.session, paper_dois, year) - return papers - - except Exception as e: - self.logger.error(f"❌ CCS论文解析主流程错误: {str(e)}") - raise - - async def _fetch_all_ccs_paper_details_via_api(self, session: AsyncSession, dois: List[str], year: str) -> List[Dict[str, Any]]: - """ - 使用POST请求批量获取所有CCS论文的JSON数据并解析。 - """ - api_url = "https://dl.acm.org/action/exportCiteProcCitation" - headers = { - 'Host': 
'dl.acm.org', - 'Cookie': '_cf_bm=12; _cfuvid=eKvDTOvVWyHDD5bNf_GLEG_fzdrvwq1g_7YIL.aZOJU-1756624678973-0.0.1.1-604800000', - 'Pragma': 'no-cache', - 'Accept': '*/*', - 'Dnt': '1', - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0', - 'Sec-Ch-Ua-Platform-Version': '"19.0.0"', - 'Origin': 'https://dl.acm.org', - 'Referer': 'https://dl.acm.org/doi/proceedings/10.1145/3658644', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6', - 'Priority': 'u=1, i', - } - dois_payload = ",".join(dois) - data_string = f"dois={dois_payload}&targetFile=custom-bibtex&format=json" - content_length = str(len(data_string.encode('utf-8'))) - headers['Content-Length'] = content_length - try: - self.logger.info(f"🚀 正在向 {api_url} 发送单次POST请求以获取 {len(dois)} 篇论文的JSON数据...") - response = await session.post( - api_url, - data=data_string, - headers=headers, - impersonate="chrome110", - timeout=180 - ) - if response.status_code != 200: - self.logger.error(f"❌ 批量获取JSON失败,HTTP状态码: {response.status_code}") - self.logger.error(f"响应内容: {response.text[:500]}") - debug_file = Path(f"debug_ccs_api_error_{response.status_code}.html") - debug_file.write_text(response.text, encoding='utf-8') - self.logger.info(f"🐛 已将错误响应保存到 {debug_file.absolute()} 以供调试。") - return [] - json_data_str = response.text - self.logger.info(f"✅ 成功获取JSON数据,大小: {len(json_data_str)}字节。开始解析...") - debug_json_file = Path("debug_ccs_json_response.json") - debug_json_file.write_text(json_data_str, encoding='utf-8') - self.logger.info(f"🐛 已将原始JSON响应保存到 {debug_json_file.absolute()} 以供分析。") - papers = self._parse_json_data(json_data_str, year) - self.logger.info(f"✅ 成功解析 {len(papers)}/{len(dois)} 篇论文的元数据。") - return papers - except Exception as e: - self.logger.error(f"❌ 批量获取和解析JSON数据时发生严重错误: {str(e)}") - import traceback 
- traceback.print_exc() - return [] - - def _parse_json_data(self, json_data_str: str, year: str) -> List[Dict[str, Any]]: - """ - 解析从ACM API返回的JSON数据。 - """ - papers = [] - try: - data = json.loads(json_data_str) - for item_dict in data.get('items', []): - for doi, details in item_dict.items(): - try: - title = details.get('title', f"未知标题 (DOI: {doi})") - authors_list = [] - for author_info in details.get('author', []): - given_name = author_info.get('given', '') - family_name = author_info.get('family', '') - authors_list.append(f"{given_name} {family_name}".strip()) - abstract = details.get('abstract', '摘要不可用') - pdf_url = f"https://dl.acm.org/doi/pdf/{doi}" - papers.append({ - 'title': title, - 'authors': authors_list, - 'abstract': abstract, - 'url': pdf_url, - 'conference': "CCS", - 'year': year - }) - except Exception as e: - self.logger.warning(f"解析单个JSON条目时出错 (DOI: {doi}): {e}") - continue - return papers - except json.JSONDecodeError as e: - self.logger.error(f"❌ JSON解析失败: {e}") - return [] - except Exception as e: - self.logger.error(f"❌ 处理JSON数据时发生未知错误: {e}") - return [] - - - def _parse_bibtex_data(self, bibtex_data: str, year: str) -> List[Dict[str, Any]]: - """ - 解析BibTeX数据字符串并返回论文列表。 - 使用更健壮的正则表达式来处理复杂的BibTeX格式。 - """ - papers = [] - # 使用更可靠的方式分割条目:按换行符后的'@'分割 - entries = re.split(r'\n@', bibtex_data) - - for entry in entries: - if not entry.strip() or not entry.startswith('inproceedings'): - continue - - try: - # 健壮的DOI提取 - doi_match = re.search(r'doi\s*=\s*\{([^}]+)\}', entry, re.IGNORECASE) - doi = doi_match.group(1).strip() if doi_match else "未知DOI" - - # 健壮的标题提取,能处理嵌套花括号 - title_match = re.search(r'title\s*=\s*\{((?:[^{}]|\{[^{}]*\})+)\}', entry, re.IGNORECASE) - title = title_match.group(1).strip().replace("{", "").replace("}", "") if title_match else f"未知标题 (DOI: {doi})" - - # 健壮的作者提取 - author_match = re.search(r'author\s*=\s*\{([^}]+)\}', entry, re.IGNORECASE) - authors_str = author_match.group(1) if author_match else "" - authors_list = 
[name.strip().replace("{", "").replace("}", "") for name in authors_str.split(' and ')] - - abstract = "摘要需访问论文页面查看" - pdf_url = f"https://dl.acm.org/doi/pdf/{doi}" - - papers.append({ - 'title': title, - 'authors': authors_list, - 'abstract': abstract, - 'url': pdf_url, - 'conference': "CCS", - 'year': year - }) - except Exception as e: - self.logger.warning(f"解析单个BibTeX条目时出错: {e}\n条目内容: {entry[:300]}...") - continue - return papers - - async def _get_all_ccs_dois_from_proceedings_page(self, session: AsyncSession, year: str) -> Optional[List[str]]: - """ - 获取CCS会议指定年份所有论文的DOI列表。 - 使用传入的持久化会话。 - """ - if not CURL_CFFI_AVAILABLE: - self.logger.error("❌ curl_cffi 未安装,无法执行CCS论文抓取。") - return None - - try: - short_year_str = f"'{year}" - full_year_str = f"20{year}" if len(year) == 2 else year - - proceedings_list_url = 'https://dl.acm.org/conference/ccs/proceedings' - self.logger.info(f"🌐 正在通过持久化会话访问CCS会议列表页面: {proceedings_list_url}") - - # 使用更完整的请求头来模拟浏览器行为 - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', - 'Cache-Control': 'max-age=0', - 'Connection': 'keep-alive', - 'DNT': '1', - 'Host': 'dl.acm.org', - 'Sec-Ch-Ua': '"Chromium";v="116", "Not)A;Brand";v="24", "Microsoft Edge";v="116"', - 'Sec-Ch-Ua-Mobile': '?0', - 'Sec-Ch-Ua-Platform': '"Windows"', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-User': '?1', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.0.0' - } - - # 尝试多次,使用不同的浏览器模拟配置 - for browser in ["chrome110", "chrome99", "chrome100", "safari15_3"]: - try: - response = await session.get( - proceedings_list_url, - impersonate=browser, - headers=headers, - timeout=45 - ) - - if response.status_code == 200: - self.logger.info(f"✅ 
使用 {browser} 成功访问") - break - else: - self.logger.warning(f"使用 {browser} 失败,HTTP状态码: {response.status_code}") - except Exception as e: - self.logger.warning(f"使用 {browser} 时出错: {str(e)}") - await asyncio.sleep(2) # 失败后等待一下再重试 - continue - - if response.status_code != 200: - self.logger.error(f"❌ 访问CCS会议列表页面失败,HTTP状态码: {response.status_code}") - return None - - self.logger.info("✅ 成功获取会议列表页面。") - soup = BeautifulSoup(response.text, 'html.parser') - - target_proc_url = None - proc_items = soup.select('li.conference__proceedings div.conference__title a') - for item in proc_items: - link_text = item.get_text(strip=True) - if short_year_str in link_text or full_year_str in link_text: - target_proc_url = urllib.parse.urljoin(proceedings_list_url, item['href']) - self.logger.info(f"✅ 找到 CCS {year} 会议录链接: {target_proc_url}") - break - - if not target_proc_url: - self.logger.error(f"❌ 未能在页面上找到 CCS {year} 的会议录链接。") - debug_file = Path("debug_acm_proceedings_list.html") - debug_file.write_text(response.text, encoding='utf-8') - self.logger.info(f"🐛 已将会议列表页面内容保存到 {debug_file.absolute()} 以供调试。") - return None - - self.logger.info(f"🌐 正在访问 CCS {year} 论文列表页面...") - response = await session.get(target_proc_url, impersonate="chrome110", timeout=45) - - if response.status_code != 200: - self.logger.error(f"❌ 访问 CCS {year} 论文列表页面失败,HTTP状态码: {response.status_code}") - return None - - self.logger.info(f"✅ 成功获取 CCS {year} 论文列表页面。") - soup = BeautifulSoup(response.text, 'html.parser') - - all_dois = [] - # 从隐藏的input中提取所有DOI - doi_inputs = soup.select('input.section--dois') - for doi_input in doi_inputs: - dois_str = doi_input.get('value', '') - if dois_str: - all_dois.extend(dois_str.split(',')) - - if not all_dois: - self.logger.warning(f"未能在 CCS {year} 页面提取到任何论文DOI。请检查页面结构是否已更改。") - debug_file = Path(f"debug_ccs_{year}_papers.html") - debug_file.write_text(response.text, encoding='utf-8') - self.logger.info(f"🐛 已将论文列表页面内容保存到 {debug_file.absolute()} 以供调试。") - return None - - # 去重并清洗 - 
unique_dois = sorted(list(set(doi.strip() for doi in all_dois if doi.strip()))) - self.logger.info(f"✅ 成功提取 {len(unique_dois)} 个唯一的 CCS {year} 论文DOI。") - return unique_dois - - except Exception as e: - self.logger.error(f"❌ 获取CCS论文DOI时发生严重错误: {str(e)}") - import traceback - traceback.print_exc() - return None - - diff --git a/src/paperbot/utils/downloader_back.py b/src/paperbot/utils/downloader_back.py deleted file mode 100644 index e049c3e7..00000000 --- a/src/paperbot/utils/downloader_back.py +++ /dev/null @@ -1,1206 +0,0 @@ -# securipaperbot/utils/downloader.py - -from typing import Dict, List, Any, Optional -import aiohttp -import asyncio -from pathlib import Path -import urllib.parse -from bs4 import BeautifulSoup -import re -import time -import random -from datetime import datetime -import logging - - -# 使用标准日志,避免相对导入问题 -def setup_logger(name): - logger = logging.getLogger(name) - if not logger.handlers: - logger.setLevel(logging.INFO) - handler = logging.StreamHandler() - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - logger.addHandler(handler) - return logger - - -class PaperDownloader: - """论文下载工具类 - 优化版本""" - - def __init__(self, config: Optional[Dict[str, Any]] = None): - self.config = config or {} - self.logger = setup_logger(__name__) - self.download_path = Path(self.config.get('download_path', './papers')) - self.download_path.mkdir(parents=True, exist_ok=True) - - # 配置下载重试参数 - 保守稳定的参数 - self.max_retries = self.config.get('max_retries', 3) - self.retry_delay = self.config.get('retry_delay', 3) # 增加到3秒 - - # 完全关闭并发 - 使用单线程确保稳定性 - max_concurrent = 1 # 强制设为1,不使用并发 - self.semaphore = asyncio.Semaphore(max_concurrent) - - # 会议URL模板 - self.conference_urls = { - 'ccs': 'https://dl.acm.org/doi/proceedings', - 'sp': 'https://ieeexplore.ieee.org/xpl/conhome/', - 'ndss': 'https://www.ndss-symposium.org/', - 'usenix': 'https://www.usenix.org/conference/' - } - - async def 
download_paper(self, url: str, title: str, paper_index: int = 0, total_papers: int = 0) -> Dict[str, Any]: - """下载单篇论文 - 优化版本""" - async with self.semaphore: - try: - # 生成文件名 - safe_title = self._sanitize_filename(title) - file_path = self.download_path / f"{safe_title}.pdf" - - # 显示下载进度(与NDSS/USENIX保持一致) - if total_papers > 0: - progress = (paper_index + 1) / total_papers * 100 - print(f"💾 [{paper_index+1}/{total_papers}] 下载: {title[:50]}{'...' if len(title) > 50 else ''}") - - # 检查是否已下载并验证文件 - if file_path.exists(): - # 验证文件大小,过小的文件可能是错误页面 - file_size = file_path.stat().st_size - if file_size > 1024: # 大于1KB认为有效 - return { - 'success': True, - 'path': str(file_path), - 'cached': True, - 'size': file_size - } - else: - # 删除无效文件 - file_path.unlink() - self.logger.warning(f"Removed invalid cached file: {file_path}") - - # 下载论文 - content = await self._download_with_retry(url) - if content: - # 验证下载内容 - if len(content) < 1024: - raise Exception(f"Downloaded content too small ({len(content)} bytes), likely an error page") - - # 保存文件 - file_path.write_bytes(content) - file_size = len(content) - - return { - 'success': True, - 'path': str(file_path), - 'cached': False, - 'size': file_size - } - else: - raise Exception("Failed to download paper - no content received") - - except Exception as e: - self.logger.error(f"Error downloading paper {title}: {str(e)}") - return { - 'success': False, - 'error': str(e) - } - - async def get_conference_papers(self, conference: str, year: str) -> List[Dict[str, Any]]: - """获取会议论文列表 - 带进度显示""" - try: - if conference not in self.conference_urls: - raise ValueError(f"Unsupported conference: {conference}") - - base_url = self.conference_urls[conference] - papers = [] - - print(f"🔍 正在获取 {conference.upper()} {year} 论文列表...") - - # 根据会议类型选择相应的解析方法 - if conference == 'ccs': - papers = await self._parse_ccs_papers(base_url, year) - elif conference == 'sp': - papers = await self._parse_sp_papers(base_url, year) - elif conference == 'ndss': - 
papers = await self._parse_ndss_papers(base_url, year) - elif conference == 'usenix': - papers = await self._parse_usenix_papers(base_url, year) - - if papers: - print(f"✅ 成功获取 {len(papers)} 篇论文信息") - - # 显示找到的论文标题 - print(f"📋 找到的论文列表:") - for i, paper in enumerate(papers[:10]): - title = paper.get('title', '未知标题')[:60] - print(f" {i+1:2d}. {title}{'...' if len(paper.get('title', '')) > 60 else ''}") - - if len(papers) > 10: - print(f" ... 和其他 {len(papers) - 10} 篇论文") - - # 开始PDF链接验证与进度显示 - print(f"\n🔗 正在验证PDF链接有效性...") - valid_count = 0 - - for i, paper in enumerate(papers): - # 显示进度 - progress = (i + 1) / len(papers) * 100 - progress_bar = '█' * int(progress // 5) + '░' * (20 - int(progress // 5)) - print(f"\r📋 [进度: {progress_bar}] {progress:.1f}% ({i+1}/{len(papers)}) 验证: {paper.get('title', '未知标题')[:30]}...", end='', flush=True) - - # 检查URL有效性 - if isinstance(paper.get('url'), str) and paper['url'].strip(): - valid_count += 1 - - print(f"\n✅ PDF链接验证完成: {valid_count}/{len(papers)} 个有效链接") - else: - print(f"⚠️ 未找到任何论文") - - return papers - - except Exception as e: - self.logger.error(f"Error getting papers for {conference} {year}: {str(e)}") - raise - - - async def _parse_ccs_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - return none - - - async def _parse_ndss_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """解析NDSS论文列表 - 优化版本带进度显示""" - papers = [] - full_year = f"20{year}" if len(year) == 2 else year - url = f"{base_url}ndss{full_year}/accepted-papers/" - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - - enhanced_timeout = aiohttp.ClientTimeout(total=120, connect=30, sock_read=60) - - print(f"🌐 访问 NDSS {year} 会议页面...") - - max_retries = 3 - for attempt in range(max_retries): - try: - async with aiohttp.ClientSession(headers=headers, timeout=enhanced_timeout) as session: - - async with session.get(url) as response: - 
print(f"⚡ 尝试 {attempt + 1}/{max_retries}: HTTP {response.status}") - - if response.status == 200: - print(f"📝 正在解析页面内容...") - html = await response.text() - soup = BeautifulSoup(html, 'html.parser') - - # 查找NDSS论文容器 - paper_containers = soup.find_all('div', class_='tag-box rel-paper') - print(f"📚 找到 {len(paper_containers)} 个论文容器") - - if not paper_containers: - print(f"⚠️ 未找到论文容器,尝试其他选择器...") - # 尝试其他可能的选择器 - paper_containers = soup.find_all('div', class_='paper') or soup.find_all('article') - print(f"🔄 备用选择器找到 {len(paper_containers)} 个容器") - - # 处理论文容器并显示简单进度 - for idx, container in enumerate(paper_containers): - if idx % 5 == 0 or idx == len(paper_containers) - 1: # 每5个显示一次进度 - progress = (idx + 1) / len(paper_containers) * 100 - print(f"🔍 解析进度: {progress:.1f}% ({idx+1}/{len(paper_containers)})") - - try: - # 提取标题 - 尝试多种选择器 - title_elem = (container.find('h3', class_='blog-post-title') or - container.find('h3') or - container.find('h2') or - container.find('h1')) - - if not title_elem: - continue - - title = title_elem.get_text().strip() - - # 显示找到的论文标题 - print(f"📄 [{idx+1}/{len(paper_containers)}] 找到论文: {title[:70]}{'...' 
if len(title) > 70 else ''}") - - # 提取作者信息 - author_elem = container.find('p') - authors_text = author_elem.get_text().strip() if author_elem else '' - authors = [author.strip() for author in authors_text.split(',')] if authors_text else [] - - # 提取详情页链接 - detail_link = (container.find('a', class_='paper-link-abs') or - container.find('a', href=True)) - detail_url = detail_link.get('href') if detail_link else '' - - # 提取PDF链接 - pdf_url = '' - if detail_url: - pdf_url = await self._get_ndss_pdf_from_detail_page(session, detail_url) - - paper_info = { - 'title': title, - 'authors': authors, - 'abstract': '', - 'url': pdf_url, - 'detail_url': detail_url, - 'doi': '' - } - - if title and len(title) > 10: - papers.append(paper_info) - - except Exception as e: - self.logger.warning(f"Error parsing paper container {idx}: {str(e)}") - continue - - print(f"\n✅ 基础信息解析完成: {len(papers)} 篇论文") - return papers - - elif response.status == 404: - print(f"❌ NDSS {year} 页面不存在") - return [] - else: - print(f"⚠️ HTTP {response.status},正在重试...") - if attempt < max_retries - 1: - await asyncio.sleep(2 ** attempt) - continue - else: - raise Exception(f"HTTP {response.status}") - - except asyncio.TimeoutError: - print(f"⏰ 超时 {attempt + 1}/{max_retries}") - if attempt < max_retries - 1: - await asyncio.sleep(5 * (attempt + 1)) - continue - else: - raise Exception("Connection timeout after retries") - - except Exception as e: - print(f"❌ 尝试 {attempt + 1} 失败: {str(e)}") - if attempt < max_retries - 1: - await asyncio.sleep(3 * (attempt + 1)) - continue - else: - raise - - return papers - - async def _get_ndss_pdf_from_detail_page(self, session: aiohttp.ClientSession, detail_url: str) -> str: - """从 NDSS 论文详情页获取 PDF 链接 - 优化版本""" - if not detail_url: - return '' - - try: - # 使用更短的超时时间提高速度 - timeout = aiohttp.ClientTimeout(total=15, connect=5) - - async with session.get(detail_url, timeout=timeout) as response: - if response.status == 200: - html = await response.text() - soup = 
BeautifulSoup(html, 'html.parser') - - # 尝试多种 PDF 链接模式 - pdf_patterns = [ - # 直接 PDF 链接 - soup.find('a', href=re.compile(r'\.pdf$', re.I)), - # 包含 "pdf" 文本的链接 - soup.find('a', string=re.compile(r'pdf', re.I)), - # 包含 "download" 的链接 - soup.find('a', string=re.compile(r'download', re.I)), - # 在 href 中包含 "pdf" 的链接 - soup.find('a', href=re.compile(r'pdf', re.I)) - ] - - for pdf_link in pdf_patterns: - if pdf_link and hasattr(pdf_link, 'get'): - pdf_url = pdf_link.get('href') - if pdf_url and isinstance(pdf_url, str): - # 确保 URL 是完整的 - if not pdf_url.startswith('http'): - if pdf_url.startswith('/'): - pdf_url = f"https://www.ndss-symposium.org{pdf_url}" - else: - # 相对路径 - base_url = '/'.join(detail_url.split('/')[:-1]) - pdf_url = f"{base_url}/{pdf_url}" - - # 验证 URL 是否以 .pdf 结尾或包含 pdf - if pdf_url.lower().endswith('.pdf') or 'pdf' in pdf_url.lower(): - return pdf_url - - # 如果没有找到直接链接,尝试查找内嵌的PDF - iframe_pdf = soup.find('iframe', src=re.compile(r'\.pdf', re.I)) - if iframe_pdf and hasattr(iframe_pdf, 'get'): - pdf_url = iframe_pdf.get('src') - if pdf_url and isinstance(pdf_url, str): - if not pdf_url.startswith('http'): - pdf_url = f"https://www.ndss-symposium.org{pdf_url}" - return pdf_url - - except asyncio.TimeoutError: - self.logger.debug(f"Timeout getting PDF from {detail_url}") - except Exception as e: - self.logger.debug(f"Error getting PDF from {detail_url}: {str(e)}") - - return '' - - async def _parse_sp_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """解析SP论文列表 - 使用Computer.org GraphQL API直接获取proceedings ID""" - papers = [] - full_year = f"20{year}" if len(year) == 2 else year - - print(f"🌐 正在获取 SP {year} proceedings ID...") - - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36', - 'Accept': 'application/json, text/plain, */*', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'DNT': '1', - 
'Content-Type': 'application/json', - 'Origin': 'https://www.computer.org', - 'Sec-Fetch-Site': 'same-origin', - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Dest': 'empty', - 'Referer': 'https://www.computer.org/csdl/proceedings/1000646', - 'Connection': 'keep-alive', - 'Cache-Control': 'no-cache' - } - - try: - async with aiohttp.ClientSession(headers=headers) as session: - # 使用GraphQL API获取SP会议的所有proceedings - graphql_url = "https://www.computer.org/csdl/api/v1/graphql" - graphql_query = { - "variables": {"groupId": "1000646"}, # SP会议的组ID - "query": "query ($groupId: String) {\n proceedings(groupId: $groupId) {\n id\n acronym\n title\n volume\n displayVolume\n year\n __typename\n }\n}" - } - - timeout = aiohttp.ClientTimeout(total=60, connect=30) - async with session.post(graphql_url, json=graphql_query, timeout=timeout) as response: - if response.status == 200: - data = await response.json() - proceedings_list = data.get('data', {}).get('proceedings', []) - - print(f"✅ 获取到 {len(proceedings_list)} 个 proceedings") - - # 查找指定年份的proceedings - target_proceeding = None - for proc in proceedings_list: - if str(proc.get('year')) == full_year: - target_proceeding = proc - break - - if not target_proceeding: - print(f"❌ 未找到 {full_year} 年的proceedings") - return papers - - proceedings_id = target_proceeding.get('id') - print(f"🆔 找到 SP {full_year} proceedings ID: {proceedings_id}") - - # 调用Computer.org API获取论文数据 - all_papers = await self._call_computer_org_api(session, proceedings_id) - - # 处理所有论文 - if all_papers: - print(f"✅ 成功获取 {len(all_papers)} 篇论文") - - # 解析真正的PDF下载链接 - print(f"🔗 开始解析 {len(all_papers)} 个PDF下载链接...") - for i, paper in enumerate(all_papers): - if paper.get('needs_pdf_resolution') and paper.get('url'): - print(f"📋 PDF解析进度: {i+1}/{len(all_papers)} - {paper.get('title', '')[:50]}...") - - real_pdf_url = await self._resolve_ieee_pdf_url(session, paper['url']) - if real_pdf_url: - paper['url'] = real_pdf_url - print(f"✅ 解析成功: {real_pdf_url[:60]}...") - else: - 
print(f"❌ PDF链接解析失败") - paper.pop('needs_pdf_resolution', None) - - papers = all_papers # 返回所有论文 - else: - papers = [] - - return papers - else: - print(f"❌ GraphQL API调用失败: HTTP {response.status}") - return [] - - except Exception as e: - print(f"❌ SP解析错误: {str(e)}") - return [] - - - async def _call_computer_org_api(self, session: aiohttp.ClientSession, proceedings_id: str) -> List[Dict[str, Any]]: - """调用Computer.org API获取论文数据""" - papers = [] - api_url = f"https://www.computer.org/csdl/api/v1/citation/asciitext/proceedings/{proceedings_id}" - - print(f"🔗 调用Computer.org API: {api_url}") - - # 使用用户提供的完整请求头 - api_headers = { - 'Host': 'www.computer.org', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36', - 'Accept': '*/*', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br, zstd', - 'DNT': '1', - 'Connection': 'keep-alive', - 'Referer': f'https://www.computer.org/csdl/proceedings/sp/2024/{proceedings_id}', - 'Sec-Fetch-Dest': 'empty', - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Site': 'same-origin', - 'Priority': 'u=1, i', - 'Pragma': 'no-cache', - 'Cache-Control': 'no-cache' - } - - try: - timeout = aiohttp.ClientTimeout(total=60, connect=30) - async with session.get(api_url, headers=api_headers, timeout=timeout) as response: - if response.status == 200: - print(f"✅ API调用成功") - text_data = await response.text() - papers = self._parse_citation_data(text_data) - print(f"📚 解析到 {len(papers)} 篇论文") - return papers - else: - print(f"❌ API调用失败: HTTP {response.status}") - return [] - - except Exception as e: - print(f"❌ API调用错误: {str(e)}") - return [] - - def _parse_citation_data(self, citation_text: str) -> List[Dict[str, Any]]: - """解析引用数据获取论文信息""" - papers = [] - - try: - # 按条目分割 - entries = re.split(r'\n\s*\n', citation_text.strip()) - - for entry in entries: - if not entry.strip(): - continue - - paper_info = 
self._parse_single_citation(entry.strip()) - if paper_info: - papers.append(paper_info) - - return papers - - except Exception as e: - self.logger.error(f"解析引用数据失败: {str(e)}") - return [] - - def _parse_single_citation(self, citation: str) -> Optional[Dict[str, Any]]: - """解析单个引用条目""" - try: - # 查找标题 (通常在引号内或作为第一行) - title_match = re.search(r'"([^"]+)"', citation) - if not title_match: - # 备用:提取第一行作为标题 - lines = citation.split('\n') - title = lines[0].strip() if lines else '' - else: - title = title_match.group(1) - - if not title or len(title) < 10: - return None - - # 检查是否包含keywords:{}(空大括号),这表示非论文内容 - if re.search(r'keywords:\s*\{\s*\}', citation, re.I): - return None - - # 过滤特定的非论文标题模式 - non_paper_patterns = [ - r'author\s+index', - r'table\s+of\s+contents', - r'program\s+committee', - r'organiz(ing|ation)\s+committee', - r'chair\s+message', - r'welcome\s+message', - r'foreword', - r'preface', - r'index\s+terms', - r'subject\s+index' - ] - - for pattern in non_paper_patterns: - if re.search(pattern, title, re.I): - return None - - # 查找作者 - authors = [] - author_match = re.search(r'Author\(s\):\s*([^\n]+)', citation) - if author_match: - authors_text = author_match.group(1) - authors = [author.strip() for author in authors_text.split(',')] - - # 查找所有链接,只保留包含doi.ieeecomputersociety.org的链接 - all_urls = re.findall(r'https?://[^\s]+', citation) - doi_url = '' - - for url in all_urls: - if 'doi.ieeecomputersociety.org' in url: - doi_url = url - break - - # 如果没找到包含doi.ieeecomputersociety.org的链接,跳过这篇论文 - if not doi_url: - return None - - return { - 'title': title, - 'authors': authors, - 'abstract': '', - 'url': doi_url, # 这里先保存DOI链接,稍后会解析真正的PDF链接 - 'doi': doi_url, - 'needs_pdf_resolution': True # 标记需要解析PDF链接 - } - - except Exception as e: - self.logger.error(f"解析单个引用失败: {str(e)}") - return None - - async def _resolve_ieee_pdf_url(self, session: aiohttp.ClientSession, doi_url: str) -> str: - """从IEEE DOI页面解析真正的PDF下载链接""" - try: - if not doi_url: - return '' - - 
print(f"🔍 解析PDF链接: {doi_url[:50]}...") - - # 访问DOI页面,跟随重定向 - timeout = aiohttp.ClientTimeout(total=30, connect=10) - - # 设置请求头模拟浏览器 - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', - 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'DNT': '1', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1' - } - - async with session.get(doi_url, timeout=timeout, headers=headers, allow_redirects=True) as response: - if response.status == 200: - html = await response.text() - soup = BeautifulSoup(html, 'html.parser') - - print(f"✅ 成功访问页面: {str(response.url)[:60]}...") - - # 查找PDF下载链接的多种模式 - pdf_patterns = [ - # Computer.org特定的DOWNLOAD PDF按钮 - soup.find('a', string=re.compile(r'DOWNLOAD PDF', re.I)), - soup.find('a', text=re.compile(r'download.*pdf', re.I)), - soup.find('button', string=re.compile(r'download.*pdf', re.I)), - - # 带有PDF相关class的链接 - soup.find('a', class_=re.compile(r'download|pdf', re.I)), - soup.find('a', attrs={'aria-label': re.compile(r'download|pdf', re.I)}), - - # 直接PDF链接 - soup.find('a', href=re.compile(r'\.pdf$', re.I)), - - # Meta标签中的PDF链接 - soup.find('meta', attrs={'name': 'citation_pdf_url'}), - soup.find('meta', attrs={'property': 'citation_pdf_url'}) - ] - - for pattern in pdf_patterns: - if pattern: - if pattern.name == 'meta': - pdf_url = pattern.get('content') - else: - pdf_url = pattern.get('href') - - if pdf_url: - # 补全相对URL - if not pdf_url.startswith('http'): - if pdf_url.startswith('/'): - pdf_url = f"https://www.computer.org{pdf_url}" - else: - base_url = '/'.join(str(response.url).split('/')[:-1]) - pdf_url = f"{base_url}/{pdf_url}" - - print(f"✅ 找到PDF链接: {pdf_url[:60]}...") - return pdf_url - - # 如果没找到直接链接,尝试查找data-*属性中PDF链接 - for element in soup.find_all(attrs={'data-pdf-url': 
True}): - pdf_url = element.get('data-pdf-url') - if pdf_url: - if not pdf_url.startswith('http'): - pdf_url = f"https://www.computer.org{pdf_url}" - print(f"🔗 从data属性找到PDF: {pdf_url[:60]}...") - return pdf_url - - # 尝试查找包含PDF的所有链接 - all_links = soup.find_all('a', href=True) - for link in all_links: - href = link.get('href') - if href and ('pdf' in href.lower() or 'download' in href.lower()): - if not href.startswith('http'): - if href.startswith('/'): - href = f"https://www.computer.org{href}" - else: - base_url = '/'.join(str(response.url).split('/')[:-1]) - href = f"{base_url}/{href}" - print(f"🔍 候选PDF链接: {href[:60]}...") - return href - - # 最后尝试:从当前页面URL构造PDF链接 - current_url = str(response.url) - if '/proceedings-article/' in current_url: - # 提取文章ID - parts = current_url.rstrip('/').split('/') - if parts: - article_id = parts[-1] - # 使用真正的Computer.org PDF下载API - pdf_url = f"https://www.computer.org/csdl/pds/api/csdl/proceedings/download-article/{article_id}/pdf" - print(f"🔗 构造PDF API链接: {pdf_url}") - return pdf_url - - else: - print(f"❌ DOI访问失败: HTTP {response.status}") - - except Exception as e: - print(f"❌ 解析PDF链接失败: {str(e)}") - - return '' - - async def _parse_usenix_papers(self, base_url: str, year: str) -> List[Dict[str, Any]]: - """解析USENIX Security论文列表 - 精确修复版本""" - papers = [] - url = f"https://www.usenix.org/conference/usenixsecurity{year}/technical-sessions" - - print(f"🌐 正在解析 USENIX Security {year} 论文列表...") - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - - try: - async with aiohttp.ClientSession(headers=headers) as session: - timeout = aiohttp.ClientTimeout(total=30, connect=10) - - async with session.get(url, timeout=timeout) as response: - if response.status == 200: - print(f"✅ 页面访问成功,开始解析...") - html = await response.text() - soup = BeautifulSoup(html, 'html.parser') - - # 使用正确的选择器找到论文节点 - paper_nodes = soup.find_all('article', 
class_='node node-paper view-mode-schedule') - - print(f"📚 找到 {len(paper_nodes)} 个论文节点") - - if not paper_nodes: - print(f"⚠️ 未找到有效的论文节点") - return papers - - for idx, node in enumerate(paper_nodes): - # 显示进度 - if idx % 10 == 0 or idx == len(paper_nodes) - 1: - progress = (idx + 1) / len(paper_nodes) * 100 - print(f"🔍 解析进度: {progress:.1f}% ({idx+1}/{len(paper_nodes)})") - - try: - # 提取标题 - 使用h2标签 - title_elem = node.find('h2') - if not title_elem: - continue - - # 从链接中获取标题文本 - link_elem = title_elem.find('a') - if not link_elem: - continue - - title = link_elem.get_text().strip() - - # 验证是否为有效论文标题 - if not self._is_valid_paper_title(title): - continue - - # 显示找到的论文标题 - print(f"📄 [{idx+1}/{len(paper_nodes)}] 找到论文: {title[:70]}{'...' if len(title) > 70 else ''}") - - # 提取作者信息 - authors = [] - author_container = node.find('div', class_='field-name-field-paper-people-text') - if author_container: - authors_text = author_container.get_text().strip() - if authors_text: - # 简单解析作者列表 - authors = [authors_text.split(',')[0].strip()] if ',' in authors_text else [authors_text] - - # 获取PDF链接 - pdf_url = await self._get_usenix_pdf_url_simple(session, node) - - if pdf_url: - papers.append({ - 'title': title, - 'authors': authors, - 'abstract': '', - 'url': pdf_url, - 'doi': '' - }) - - except Exception as e: - self.logger.debug(f"解析节点 {idx} 失败: {str(e)}") - continue - - print(f"✅ USENIX 解析完成: {len(papers)} 篇论文") - return papers - - elif response.status == 404: - print(f"❌ USENIX {year} 页面不存在") - return [] - else: - print(f"❌ HTTP {response.status}") - return [] - - except asyncio.TimeoutError: - print(f"⏰ 访问超时") - return [] - except Exception as e: - print(f"❌ 解析错误: {str(e)}") - return [] - - async def _get_usenix_pdf_url_simple(self, session: aiohttp.ClientSession, node) -> str: - """简化的USENIX PDF链接获取方法""" - try: - # 1. 
首先查找直接的PDF链接 - pdf_link = node.find('a', href=re.compile(r'\.pdf$', re.I)) - if pdf_link: - href = pdf_link.get('href') - if href: - return self._complete_usenix_url(href) - - # 2. 查找presentation页面链接 - presentation_link = node.find('a', href=re.compile(r'/presentation/', re.I)) - if presentation_link: - presentation_url = presentation_link.get('href') - if presentation_url: - presentation_url = self._complete_usenix_url(presentation_url) - - # 从presentation页面获取PDF链接 - try: - timeout = aiohttp.ClientTimeout(total=10, connect=5) - async with session.get(presentation_url, timeout=timeout) as response: - if response.status == 200: - html = await response.text() - soup = BeautifulSoup(html, 'html.parser') - - # 查找PDF下载链接 - pdf_link = soup.find('a', href=re.compile(r'\.pdf$', re.I)) - if pdf_link: - pdf_url = pdf_link.get('href') - if pdf_url: - return self._complete_usenix_url(pdf_url) - except Exception as e: - self.logger.debug(f"presentation页面获取失败: {str(e)}") - - return '' - - except Exception as e: - self.logger.debug(f"获取PDF链接失败: {str(e)}") - return '' - - def _is_valid_paper_title(self, title: str) -> bool: - """验证是否为有效的论文标题""" - if not title or len(title) < 10: # 降低最小长度要求 - return False - - # 只排除明显的非论文内容,减少过滤 - exclude_keywords = [ - 'technical session', 'session chair', 'keynote', 'tutorial', - 'workshop', 'break', 'lunch', 'coffee break', 'opening remarks', - 'closing remarks', 'panel discussion', 'poster session' - ] - - title_lower = title.lower() - for keyword in exclude_keywords: - if keyword in title_lower: - return False - - # 放宽条件,只要有一定长度就认为是有效论文 - return len(title) > 15 - - def _complete_usenix_url(self, url: str) -> str: - """补全USENIX URL""" - if not url: - return '' - - if url.startswith('http'): - return url - elif url.startswith('/'): - return f"https://www.usenix.org{url}" - else: - return f"https://www.usenix.org/{url}" - - - def _extract_ieee_pdf_url(self, paper_element) -> str: - """从IEEE页面元素中提取PDF链接""" - try: - # 查找PDF链接 - pdf_link = 
paper_element.find('a', href=re.compile(r'.*\.pdf')) - if pdf_link: - return pdf_link['href'] - - # 查找文章链接并构造PDF URL - article_link = paper_element.find('a', href=re.compile(r'/document/')) - if article_link: - doc_id = re.search(r'/document/(\d+)', article_link['href']) - if doc_id: - return f"https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={doc_id.group(1)}" - - return '' - except Exception as e: - self.logger.error(f"Error extracting IEEE PDF URL: {str(e)}") - return '' - """从IEEE页面元素中提取PDF链接""" - try: - # 查找PDF链接 - pdf_link = paper_element.find('a', href=re.compile(r'.*\.pdf')) - if pdf_link: - return pdf_link['href'] - - # 查找文章链接并构造PDF URL - article_link = paper_element.find('a', href=re.compile(r'/document/')) - if article_link: - doc_id = re.search(r'/document/(\d+)', article_link['href']) - if doc_id: - return f"https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber={doc_id.group(1)}" - - return '' - except Exception as e: - self.logger.error(f"Error extracting IEEE PDF URL: {str(e)}") - return '' - - - - async def _download_with_retry(self, url: str) -> Optional[bytes]: - """带重试机制的下载 - 支持CCS反爬策略""" - if not url: - return None - - # 对ACM CCS论文使用特殊处理 - if 'dl.acm.org/doi/pdf/' in url: - return await self._download_acm_ccs_pdf_enhanced(url) - - # 对Computer.org的SP论文使用特殊处理 - if 'computer.org/csdl/pds/api' in url: - return await self._download_computer_org_pdf(url) - - # 其他链接使用原有方法 - return await self._download_with_aiohttp(url) - - async def _download_acm_ccs_pdf_enhanced(self, url: str) -> Optional[bytes]: - """增强版ACM CCS PDF下载方法 - 完整的反爬策略""" - try: - # 1. 首先访问数据库主页获取session - await self._warm_up_acm_session() - - # 2. 提取DOI信息 - doi_match = re.search(r'/doi/pdf/(.+)', url) - if not doi_match: - raise Exception("Invalid DOI URL format") - - doi = doi_match.group(1) - acm_page_url = f"https://dl.acm.org/doi/{doi}" - - # 3. 
先访问论文页面获取cookies和referer - print(f"🔍 访问论文页面: {acm_page_url[:60]}...") - - page_headers = self._get_anti_crawler_headers() - - async with aiohttp.ClientSession(headers=page_headers) as session: - # 随机延迟 - await asyncio.sleep(random.uniform(2, 5)) - - timeout = aiohttp.ClientTimeout(total=30, connect=10) - async with session.get(acm_page_url, timeout=timeout) as response: - if response.status == 200: - print(f"✅ 成功访问论文页面") - - # 4. 现在下载PDF,使用论文页面作为referer - pdf_headers = self._get_pdf_download_headers(acm_page_url) - - # 添加更长的延迟 - await asyncio.sleep(random.uniform(3, 7)) - - print(f"📥 开始下载PDF: {url[:60]}...") - - async with session.get(url, headers=pdf_headers, timeout=timeout) as pdf_response: - if pdf_response.status == 200: - content = await pdf_response.read() - - # 验证PDF文件 - if self._is_valid_pdf(content): - print(f"✅ PDF下载成功 (size: {len(content)/1024:.1f}KB)") - return content - else: - raise Exception("Downloaded content is not a valid PDF") - elif pdf_response.status == 403: - raise Exception("Access denied - may need institutional access or VPN") - elif pdf_response.status == 429: - raise Exception("Rate limited - need to slow down requests") - else: - raise Exception(f"PDF download failed: HTTP {pdf_response.status}") - elif response.status == 403: - raise Exception("Access to paper page denied - may need institutional access") - else: - raise Exception(f"Paper page access failed: HTTP {response.status}") - - except Exception as e: - self.logger.warning(f"Enhanced ACM download failed: {str(e)}") - # 如果增强方法失败,尝试简单方法 - return await self._download_acm_ccs_pdf_simple(url) - - async def _warm_up_acm_session(self): - """预热 ACM session,获取必要的 cookies""" - try: - headers = self._get_anti_crawler_headers() - - async with aiohttp.ClientSession(headers=headers) as session: - # 访问 ACM 主页 - await asyncio.sleep(random.uniform(1, 3)) - timeout = aiohttp.ClientTimeout(total=20, connect=10) - - async with session.get('https://dl.acm.org/', timeout=timeout) as response: - if 
response.status == 200: - print(f"🔥 ACM session 预热成功") - else: - print(f"⚠️ ACM session 预热失败: HTTP {response.status}") - - except Exception as e: - self.logger.debug(f"ACM session warm-up failed: {str(e)}") - - async def _download_acm_ccs_pdf_simple(self, url: str) -> Optional[bytes]: - """简单版ACM CCS PDF下载方法作为备用""" - try: - headers = self._get_pdf_download_headers() - - async with aiohttp.ClientSession(headers=headers) as session: - await asyncio.sleep(random.uniform(2, 5)) - - timeout = aiohttp.ClientTimeout(total=60, connect=20) - async with session.get(url, timeout=timeout) as response: - if response.status == 200: - content = await response.read() - if self._is_valid_pdf(content): - return content - - return None - - except Exception as e: - self.logger.warning(f"Simple ACM download failed: {str(e)}") - return None - - async def _download_with_retry(self, url: str) -> Optional[bytes]: - """带重试机制的下载 - 修复版本""" - if not url: - return None - - # 对Computer.org的SP论文使用特殊处理 - if 'computer.org/csdl/pds/api' in url: - return await self._download_computer_org_pdf(url) - - # 对ACM CCS论文使用特殊处理 - if 'dl.acm.org/doi/pdf/' in url: - return await self._download_acm_ccs_pdf(url) - - # 其他链接使用原有方法 - return await self._download_with_aiohttp(url) - - async def _download_acm_ccs_pdf(self, url: str) -> Optional[bytes]: - """专门为ACM CCS PDF下载的方法 - 使用curl绕过保护机制""" - # 尝试多次下载,增加成功率 - for attempt in range(3): - result = await self._download_with_curl(url) - if result: - return result - # 添加延迟 - import asyncio - import random - delay = random.uniform(5, 10) - await asyncio.sleep(delay) - return None - - async def _download_computer_org_pdf(self, api_url: str) -> Optional[bytes]: - """专门为Computer.org PDF下载的方法 - 使用curl""" - return await self._download_with_curl(api_url) - - async def _download_with_curl(self, url: str) -> Optional[bytes]: - """使用curl命令下载,绕过ACM保护机制""" - try: - import asyncio - import subprocess - import random - import re - - # 提取DOI用于Referer头 - doi_match = 
re.search(r'/doi/pdf/(.+)', url) - referer = f"https://dl.acm.org/doi/{doi_match.group(1)}" if doi_match else url - - # 随机选择User-Agent,模拟不同浏览器 - user_agents = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36 Edg/139.0.0.0', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36', - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36' - ] - - # 构建curl命令,模拟真实浏览器请求 - cmd = [ - 'curl', - '-L', # 跟随重定向 - '-s', # 静默模式 - '--max-time', '60', # 最大超时时间 - '--user-agent', random.choice(user_agents), - '--header', 'Accept: application/pdf,application/octet-stream,*/*;q=0.8', - '--header', f'Referer: {referer}', - '--header', 'Accept-Language: en-US,en;q=0.9', - '--header', 'Accept-Encoding: gzip, deflate, br', - '--header', 'Connection: keep-alive', - '--header', 'Upgrade-Insecure-Requests: 1', - '--header', 'Sec-Fetch-Dest: document', - '--header', 'Sec-Fetch-Mode: navigate', - '--header', 'Sec-Fetch-Site: same-origin', - '--header', 'Cache-Control: max-age=0', - '--header', 'DNT: 1', - '--header', 'Sec-Ch-Ua: "Not;A=Brand";v="99", "Microsoft Edge";v="139", "Chromium";v="139"', - '--header', 'Sec-Ch-Ua-Mobile: ?0', - '--header', 'Sec-Ch-Ua-Platform: "Windows"', - url - ] - - # 使用subprocess异步执行curl - process = await asyncio.create_subprocess_exec( - *cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE - ) - - stdout, stderr = await process.communicate() - - if process.returncode == 0 and stdout and len(stdout) > 1024: - # 验证PDF文件类型 - if self._is_valid_pdf(stdout): - return stdout - else: - return None - else: - error_msg = stderr.decode('utf-8', errors='ignore') if stderr else 'Unknown error' - - except Exception as e: - pass - - return None - - async def 
_download_with_aiohttp(self, url: str) -> Optional[bytes]: - """原有的aiohttp下载方法""" - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'Accept': 'application/pdf,application/octet-stream,*/*', - 'Accept-Encoding': 'gzip, deflate, br', - 'Connection': 'keep-alive' - } - - for attempt in range(self.max_retries): - try: - # 为每次重试创建新的Session,避免连接问题 - connector = aiohttp.TCPConnector( - limit=5, - limit_per_host=2, - force_close=True, # 强制关闭连接避免复用问题 - enable_cleanup_closed=True - ) - - timeout = aiohttp.ClientTimeout( - total=90, - connect=30, - sock_read=60 - ) - - async with aiohttp.ClientSession( - headers=headers, - timeout=timeout, - connector=connector - ) as session: - async with session.get(url) as response: - if response.status == 200: - content = await response.read() - if len(content) > 0: - return content - else: - self.logger.warning(f"Empty content from {url}") - elif response.status in [429, 503, 502]: - self.logger.warning( - f"Server busy (attempt {attempt + 1}/{self.max_retries}): " - f"HTTP {response.status} for {url}" - ) - await asyncio.sleep(self.retry_delay * (attempt + 1) * 2) - continue - else: - self.logger.warning( - f"Download failed (attempt {attempt + 1}/{self.max_retries}): " - f"HTTP {response.status} for {url}" - ) - - except asyncio.TimeoutError: - self.logger.warning( - f"Download timeout (attempt {attempt + 1}/{self.max_retries}): {url}" - ) - except Exception as e: - self.logger.warning( - f"Download error (attempt {attempt + 1}/{self.max_retries}): {str(e)} for {url}" - ) - - # 重试间隔 - if attempt < self.max_retries - 1: - delay = self.retry_delay * (attempt + 1) - await asyncio.sleep(delay) - - self.logger.error(f"Failed to download after {self.max_retries} attempts: {url}") - return None - - def _sanitize_filename(self, filename: str) -> str: - """清理文件名""" - # 移除不允许的字符 - invalid_chars = '<>:"/\\|?*' - for char in invalid_chars: - 
filename = filename.replace(char, '') - - # 将空格替换为下划线 - filename = filename.replace(' ', '_') - - # 限制长度 - max_length = 255 - len('.pdf') - if len(filename) > max_length: - filename = filename[:max_length] - - return filename.strip('._') - - def _is_valid_pdf(self, content: bytes) -> bool: - """验证是否为有效的PDF文件""" - if not content or len(content) < 4: - return False - - # 检查PDF文件头(%PDF-) - return content.startswith(b'%PDF-') - - diff --git a/src/paperbot/utils/keyword_optimizer.py b/src/paperbot/utils/keyword_optimizer.py deleted file mode 100644 index 4c33e330..00000000 --- a/src/paperbot/utils/keyword_optimizer.py +++ /dev/null @@ -1,491 +0,0 @@ -""" -关键词优化器 - -参考: BettaFish/MediaEngine/tools/ -适配: PaperBot 学者追踪 - 提升 Semantic Scholar 查询命中率 - -功能: -- 查询扩展 (同义词、相关术语) -- 查询重写 (LLM 辅助) -- 安全领域术语优化 -""" - -from typing import List, Dict, Any, Optional, Set -from dataclasses import dataclass, field -from loguru import logger - - -# ===== 安全领域术语库 ===== - -SECURITY_SYNONYMS: Dict[str, List[str]] = { - # 攻击类型 - "vulnerability": ["security flaw", "weakness", "exploit", "bug", "CVE"], - "malware": ["virus", "trojan", "ransomware", "worm", "spyware", "rootkit"], - "phishing": ["social engineering", "spear phishing", "whaling"], - "ddos": ["denial of service", "distributed denial of service", "DoS attack"], - "injection": ["SQL injection", "code injection", "command injection", "XSS"], - "overflow": ["buffer overflow", "stack overflow", "heap overflow", "memory corruption"], - - # 防御技术 - "detection": ["intrusion detection", "anomaly detection", "threat detection"], - "encryption": ["cryptography", "cipher", "AES", "RSA", "TLS", "SSL"], - "authentication": ["identity verification", "MFA", "2FA", "biometrics"], - "firewall": ["network security", "packet filtering", "WAF"], - "antivirus": ["anti-malware", "endpoint protection", "EDR"], - - # 研究领域 - "fuzzing": ["fuzz testing", "mutation testing", "coverage-guided fuzzing", "AFL"], - "binary analysis": ["reverse engineering", 
"disassembly", "decompilation"], - "program analysis": ["static analysis", "dynamic analysis", "taint analysis"], - "formal verification": ["model checking", "theorem proving", "symbolic execution"], - "machine learning security": ["adversarial ML", "ML robustness", "AI security"], - - # 会议/标准 - "top venue": ["S&P", "CCS", "USENIX Security", "NDSS"], - "IEEE S&P": ["Oakland", "IEEE Symposium on Security and Privacy"], - "CCS": ["ACM CCS", "ACM Conference on Computer and Communications Security"], - "USENIX Security": ["USENIX Security Symposium"], - "NDSS": ["Network and Distributed System Security"], -} - -# 常见缩写展开 -ABBREVIATION_EXPANSIONS: Dict[str, str] = { - "ML": "machine learning", - "DL": "deep learning", - "AI": "artificial intelligence", - "NLP": "natural language processing", - "IoT": "Internet of Things", - "ICS": "industrial control systems", - "SCADA": "supervisory control and data acquisition", - "APT": "advanced persistent threat", - "CVE": "Common Vulnerabilities and Exposures", - "CTF": "capture the flag", - "PWN": "binary exploitation", - "RE": "reverse engineering", - "ROP": "return-oriented programming", - "ASLR": "address space layout randomization", - "DEP": "data execution prevention", - "CFI": "control flow integrity", - "SGX": "Software Guard Extensions", - "TEE": "trusted execution environment", - "TLS": "Transport Layer Security", - "PKI": "public key infrastructure", -} - - -# ===== 数据结构 ===== - -@dataclass -class OptimizedQuery: - """优化后的查询""" - original: str - optimized: str - expansions: List[str] = field(default_factory=list) - score: float = 1.0 - metadata: Dict[str, Any] = field(default_factory=dict) - - -# ===== 关键词优化器 ===== - -class KeywordOptimizer: - """ - 关键词优化器 - - 提供查询扩展和优化功能,提升搜索命中率 - """ - - def __init__( - self, - synonyms: Optional[Dict[str, List[str]]] = None, - abbreviations: Optional[Dict[str, str]] = None, - llm_client: Any = None, - ): - """ - 初始化优化器 - - Args: - synonyms: 自定义同义词词典 - abbreviations: 自定义缩写词典 - 
llm_client: LLM 客户端 (用于高级重写) - """ - self.synonyms = {**SECURITY_SYNONYMS, **(synonyms or {})} - self.abbreviations = {**ABBREVIATION_EXPANSIONS, **(abbreviations or {})} - self.llm_client = llm_client - - def expand_abbreviations(self, query: str) -> str: - """ - 展开查询中的缩写 - - Args: - query: 原始查询 - - Returns: - 展开后的查询 - """ - words = query.split() - expanded = [] - - for word in words: - upper_word = word.upper() - if upper_word in self.abbreviations: - # 保留原词并添加展开形式 - expanded.append(f"{word} ({self.abbreviations[upper_word]})") - else: - expanded.append(word) - - return " ".join(expanded) - - def get_synonyms(self, term: str) -> List[str]: - """ - 获取术语的同义词 - - Args: - term: 术语 - - Returns: - 同义词列表 - """ - term_lower = term.lower() - - # 直接匹配 - if term_lower in self.synonyms: - return self.synonyms[term_lower] - - # 部分匹配 - results = [] - for key, values in self.synonyms.items(): - if term_lower in key or key in term_lower: - results.extend(values) - for value in values: - if term_lower in value.lower(): - results.append(key) - break - - return list(set(results)) - - def expand_query(self, query: str, max_expansions: int = 3) -> OptimizedQuery: - """ - 扩展查询 - - Args: - query: 原始查询 - max_expansions: 最大扩展数量 - - Returns: - 优化后的查询 - """ - # 1. 展开缩写 - expanded = self.expand_abbreviations(query) - - # 2. 收集同义词 - words = query.lower().split() - all_synonyms: Set[str] = set() - - for word in words: - syns = self.get_synonyms(word) - all_synonyms.update(syns[:max_expansions]) - - # 3. 
构建扩展查询 - expansions = list(all_synonyms)[:max_expansions] - - if expansions: - expansion_str = " OR ".join(f'"{e}"' for e in expansions) - optimized = f"({expanded}) OR ({expansion_str})" - else: - optimized = expanded - - return OptimizedQuery( - original=query, - optimized=optimized, - expansions=expansions, - ) - - def optimize_for_semantic_scholar(self, query: str) -> OptimizedQuery: - """ - 针对 Semantic Scholar API 优化查询 - - Args: - query: 原始查询 - - Returns: - 优化后的查询 - """ - # Semantic Scholar 搜索技巧: - # - 使用引号进行精确匹配 - # - 简洁明了的关键词效果更好 - # - 避免过长的查询 - - # 1. 基本清理 - cleaned = query.strip() - - # 2. 检测并保留已有引号 - if '"' in cleaned: - # 用户已经使用了精确匹配,保持原样 - optimized = cleaned - expansions = [] - else: - # 3. 识别核心术语 - core_terms = self._extract_core_terms(cleaned) - - # 4. 展开缩写但不过度扩展 - expanded_terms = [] - expansions = [] - - for term in core_terms: - upper_term = term.upper() - if upper_term in self.abbreviations: - expanded_terms.append(self.abbreviations[upper_term]) - expansions.append(self.abbreviations[upper_term]) - else: - expanded_terms.append(term) - - optimized = " ".join(expanded_terms) - - return OptimizedQuery( - original=query, - optimized=optimized, - expansions=expansions, - metadata={"target": "semantic_scholar"}, - ) - - def _extract_core_terms(self, query: str) -> List[str]: - """提取核心术语""" - # 简单的停用词过滤 - stop_words = { - "a", "an", "the", "in", "on", "at", "to", "for", "of", - "and", "or", "but", "is", "are", "was", "were", "be", - "with", "by", "from", "as", "into", "through", "during", - "before", "after", "above", "below", "between", "under", - } - - words = query.lower().split() - core = [w for w in words if w not in stop_words and len(w) > 2] - - return core if core else words - - def generate_search_variants(self, query: str, num_variants: int = 3) -> List[OptimizedQuery]: - """ - 生成多个搜索变体 - - Args: - query: 原始查询 - num_variants: 变体数量 - - Returns: - 查询变体列表 - """ - variants = [] - - # 变体1: 原始查询 - variants.append(OptimizedQuery( - original=query, 
- optimized=query, - score=1.0, - metadata={"type": "original"}, - )) - - # 变体2: 缩写展开 - expanded = self.expand_abbreviations(query) - if expanded != query: - variants.append(OptimizedQuery( - original=query, - optimized=expanded, - score=0.9, - metadata={"type": "abbreviation_expanded"}, - )) - - # 变体3: 同义词扩展 - syn_query = self.expand_query(query) - if syn_query.optimized != query: - syn_query.score = 0.8 - syn_query.metadata["type"] = "synonym_expanded" - variants.append(syn_query) - - # 变体4: 精确匹配 (如果查询足够短) - if len(query.split()) <= 4: - variants.append(OptimizedQuery( - original=query, - optimized=f'"{query}"', - score=0.7, - metadata={"type": "exact_match"}, - )) - - return variants[:num_variants] - - async def rewrite_with_llm(self, query: str, context: Optional[str] = None) -> OptimizedQuery: - """ - 使用 LLM 重写查询 - - Args: - query: 原始查询 - context: 额外上下文 - - Returns: - 重写后的查询 - """ - if not self.llm_client: - logger.warning("LLM 客户端未配置,返回原始查询") - return OptimizedQuery(original=query, optimized=query) - - prompt = f"""你是一个学术搜索专家。请将以下查询优化为更适合在学术数据库(如 Semantic Scholar)中搜索的形式。 - -原始查询: {query} -{f"上下文: {context}" if context else ""} - -要求: -1. 使用标准学术术语 -2. 保持简洁(不超过6个关键词) -3. 去除无关词汇 -4. 
如有必要,展开缩写 - -只返回优化后的查询,不要解释。""" - - try: - response = await self.llm_client.invoke_async( - messages=[{"role": "user", "content": prompt}] - ) - optimized = response.strip().strip('"') - - return OptimizedQuery( - original=query, - optimized=optimized, - metadata={"type": "llm_rewritten"}, - ) - except Exception as e: - logger.error(f"LLM 重写失败: {e}") - return OptimizedQuery(original=query, optimized=query) - - -# ===== 安全论文查询构建器 ===== - -class SecurityPaperQueryBuilder: - """ - 安全论文查询构建器 - - 专门用于构建安全领域的学术搜索查询 - """ - - TOP_VENUES = [ - "IEEE S&P", - "CCS", - "USENIX Security", - "NDSS", - ] - - ATTACK_CATEGORIES = { - "web": ["XSS", "CSRF", "SQL injection", "SSRF", "web vulnerability"], - "network": ["DDoS", "man-in-the-middle", "network attack", "traffic analysis"], - "system": ["buffer overflow", "kernel exploit", "privilege escalation", "rootkit"], - "mobile": ["Android security", "iOS security", "mobile malware", "app vulnerability"], - "iot": ["IoT security", "smart device", "embedded security", "firmware"], - "ml": ["adversarial", "model poisoning", "backdoor attack", "evasion attack"], - } - - DEFENSE_CATEGORIES = { - "detection": ["intrusion detection", "anomaly detection", "malware detection"], - "prevention": ["access control", "sandboxing", "isolation", "mitigation"], - "analysis": ["static analysis", "dynamic analysis", "fuzzing", "symbolic execution"], - "crypto": ["encryption", "authentication", "secure protocol", "key management"], - } - - def __init__(self, optimizer: Optional[KeywordOptimizer] = None): - """ - 初始化构建器 - - Args: - optimizer: 关键词优化器 - """ - self.optimizer = optimizer or KeywordOptimizer() - - def build_attack_query(self, attack_type: str, specific_terms: Optional[List[str]] = None) -> str: - """ - 构建攻击类型查询 - - Args: - attack_type: 攻击类别 (web, network, system, mobile, iot, ml) - specific_terms: 特定术语 - - Returns: - 查询字符串 - """ - base_terms = self.ATTACK_CATEGORIES.get(attack_type.lower(), [attack_type]) - all_terms = base_terms + 
(specific_terms or []) - - return " OR ".join(f'"{term}"' for term in all_terms) - - def build_defense_query(self, defense_type: str, specific_terms: Optional[List[str]] = None) -> str: - """ - 构建防御类型查询 - - Args: - defense_type: 防御类别 (detection, prevention, analysis, crypto) - specific_terms: 特定术语 - - Returns: - 查询字符串 - """ - base_terms = self.DEFENSE_CATEGORIES.get(defense_type.lower(), [defense_type]) - all_terms = base_terms + (specific_terms or []) - - return " OR ".join(f'"{term}"' for term in all_terms) - - def build_venue_filter(self, venues: Optional[List[str]] = None) -> str: - """ - 构建会议/期刊过滤 - - Args: - venues: 会议/期刊列表,默认为四大安全顶会 - - Returns: - venue 过滤字符串 - """ - target_venues = venues or self.TOP_VENUES - return " OR ".join(f'venue:"{v}"' for v in target_venues) - - def build_comprehensive_query( - self, - topic: str, - attack_type: Optional[str] = None, - defense_type: Optional[str] = None, - top_venues_only: bool = False, - year_range: Optional[str] = None, - ) -> str: - """ - 构建综合查询 - - Args: - topic: 主题 - attack_type: 攻击类别 - defense_type: 防御类别 - top_venues_only: 是否只搜索顶会 - year_range: 年份范围 - - Returns: - 综合查询字符串 - """ - parts = [topic] - - if attack_type: - parts.append(f"({self.build_attack_query(attack_type)})") - - if defense_type: - parts.append(f"({self.build_defense_query(defense_type)})") - - query = " AND ".join(parts) - - if top_venues_only: - query = f"({query}) AND ({self.build_venue_filter()})" - - return query - - -__all__ = [ - # 数据结构 - "OptimizedQuery", - # 优化器 - "KeywordOptimizer", - "SecurityPaperQueryBuilder", - # 常量 - "SECURITY_SYNONYMS", - "ABBREVIATION_EXPANSIONS", -] diff --git a/src/paperbot/utils/smart_downloader.py b/src/paperbot/utils/smart_downloader.py deleted file mode 100644 index 35333aaf..00000000 --- a/src/paperbot/utils/smart_downloader.py +++ /dev/null @@ -1,243 +0,0 @@ -# utils/smart_downloader.py - -import asyncio -import time -from typing import Dict, List, Any, Optional -from collections import deque -from 
dataclasses import dataclass -import logging -from paperbot.utils.downloader import PaperDownloader - -@dataclass -class DownloadStats: - """下载统计信息""" - total_attempts: int = 0 - successful_downloads: int = 0 - failed_downloads: int = 0 - cached_hits: int = 0 - avg_download_time: float = 0.0 - current_success_rate: float = 1.0 - consecutive_failures: int = 0 - -class SmartDownloadManager: - """智能下载管理器 - 动态调整并发数""" - - def __init__(self, config: Optional[Dict[str, Any]] = None): - self.config = config or {} - self.logger = logging.getLogger(__name__) - - # 创建基础下载器 - self.downloader = PaperDownloader(config) - - # 并发控制参数 - self.min_concurrent = 1 # 最小并发数 - self.max_concurrent = 4 # 最大并发数 - self.current_concurrent = 2 # 当前并发数 - - # 性能监控参数 - self.stats = DownloadStats() - self.recent_times = deque(maxlen=10) # 最近10次下载时间 - self.adjustment_threshold = 5 # 调整并发数的评估周期 - - # 安全参数 - self.failure_threshold = 0.3 # 失败率阈值 (30%) - self.slow_threshold = 10.0 # 慢下载阈值 (10秒) - self.rest_interval = 1.0 # 请求间隔 - - # 创建信号量 - self.semaphore = asyncio.Semaphore(self.current_concurrent) - - self.logger.info(f"智能下载管理器初始化 - 并发范围: {self.min_concurrent}-{self.max_concurrent}") - - async def download_papers_smart(self, papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """智能批量下载论文""" - if not papers: - return [] - - valid_papers = [p for p in papers if p.get('url') and p.get('url').strip()] - if not valid_papers: - self.logger.warning("没有找到有效的PDF下载链接") - return [] - - self.logger.info(f"🚀 开始智能下载 {len(valid_papers)} 篇论文") - self.logger.info(f"📊 初始并发数: {self.current_concurrent}") - - start_time = time.time() - results = [] - - # 分批处理,每批动态调整并发数 - batch_size = max(8, self.current_concurrent * 2) # 批次大小 - - for i in range(0, len(valid_papers), batch_size): - batch = valid_papers[i:i + batch_size] - batch_results = await self._process_batch(batch, i, len(valid_papers)) - results.extend(batch_results) - - # 动态调整并发数 - await self._adjust_concurrency() - - # 批次间休息 - if i + batch_size < 
len(valid_papers): - await asyncio.sleep(self.rest_interval) - - # 最终统计 - total_time = time.time() - start_time - self._print_final_stats(results, total_time) - - return results - - async def _process_batch(self, batch: List[Dict[str, Any]], start_idx: int, total: int) -> List[Dict[str, Any]]: - """处理一个批次的下载""" - self.logger.info(f"\n📦 处理批次 [{start_idx+1}-{min(start_idx+len(batch), total)}/{total}] - 并发数: {self.current_concurrent}") - - # 创建下载任务 - tasks = [] - for i, paper in enumerate(batch): - task = self._download_with_monitoring(paper, start_idx + i + 1, total) - tasks.append(task) - - # 执行批次下载 - results = await asyncio.gather(*tasks, return_exceptions=True) - - # 处理异常结果 - processed_results = [] - for result in results: - if isinstance(result, Exception): - self.logger.error(f"批次下载异常: {str(result)}") - processed_results.append({'success': False, 'error': str(result)}) - else: - processed_results.append(result) - - return processed_results - - async def _download_with_monitoring(self, paper: Dict[str, Any], index: int, total: int) -> Dict[str, Any]: - """带监控的单篇论文下载""" - async with self.semaphore: - start_time = time.time() - - try: - print(f"🔄 [{index}/{total}] 下载: {paper['title'][:50]}...") - - # 执行下载 - result = await self.downloader.download_paper(paper['url'], paper['title']) - download_time = time.time() - start_time - - # 更新统计信息 - self._update_stats(result, download_time) - - # 显示结果 - if result and result.get('success'): - if result.get('cached'): - print(f"📋 缓存命中 (耗时: {download_time:.1f}s)") - else: - size_kb = result.get('size', 0) / 1024 - print(f"✅ 下载成功 (耗时: {download_time:.1f}s, 大小: {size_kb:.1f}KB)") - else: - error_msg = result.get('error', '未知错误') if result else '下载失败' - print(f"❌ 下载失败: {error_msg}") - - return result or {'success': False, 'error': '下载失败'} - - except Exception as e: - download_time = time.time() - start_time - self._update_stats({'success': False}, download_time) - self.logger.error(f"下载异常 [{index}/{total}]: {str(e)}") - print(f"❌ 
下载异常: {str(e)}") - return {'success': False, 'error': str(e)} - - def _update_stats(self, result: Dict[str, Any], download_time: float): - """更新下载统计信息""" - self.stats.total_attempts += 1 - - if result and result.get('success'): - if result.get('cached'): - self.stats.cached_hits += 1 - else: - self.stats.successful_downloads += 1 - self.recent_times.append(download_time) - self.stats.consecutive_failures = 0 - else: - self.stats.failed_downloads += 1 - self.stats.consecutive_failures += 1 - - # 计算成功率 - if self.stats.total_attempts > 0: - self.stats.current_success_rate = ( - self.stats.successful_downloads + self.stats.cached_hits - ) / self.stats.total_attempts - - # 计算平均下载时间 - if self.recent_times: - self.stats.avg_download_time = sum(self.recent_times) / len(self.recent_times) - - async def _adjust_concurrency(self): - """动态调整并发数""" - if self.stats.total_attempts < self.adjustment_threshold: - return # 样本太少,不调整 - - old_concurrent = self.current_concurrent - - # 决策逻辑 - if self.stats.consecutive_failures >= 3: - # 连续失败,降低并发 - self.current_concurrent = max(self.min_concurrent, self.current_concurrent - 1) - reason = f"连续失败{self.stats.consecutive_failures}次" - - elif self.stats.current_success_rate < self.failure_threshold: - # 成功率太低,降低并发 - self.current_concurrent = max(self.min_concurrent, self.current_concurrent - 1) - reason = f"成功率过低({self.stats.current_success_rate:.1%})" - - elif self.stats.avg_download_time > self.slow_threshold: - # 平均速度太慢,降低并发 - self.current_concurrent = max(self.min_concurrent, self.current_concurrent - 1) - reason = f"平均速度过慢({self.stats.avg_download_time:.1f}s)" - - elif (self.stats.current_success_rate > 0.8 and - self.stats.avg_download_time < 5.0 and - self.stats.consecutive_failures == 0): - # 表现良好,增加并发 - self.current_concurrent = min(self.max_concurrent, self.current_concurrent + 1) - reason = f"性能良好(成功率{self.stats.current_success_rate:.1%})" - else: - return # 保持当前并发数 - - # 如果并发数发生变化,更新信号量 - if old_concurrent != 
self.current_concurrent: - self.logger.info(f"🔧 调整并发数: {old_concurrent} → {self.current_concurrent} ({reason})") - - # 创建新的信号量 - self.semaphore = asyncio.Semaphore(self.current_concurrent) - - # 调整后稍作休息 - await asyncio.sleep(2.0) - - def _print_final_stats(self, results: List[Dict[str, Any]], total_time: float): - """打印最终统计信息""" - success_count = sum(1 for r in results if r.get('success')) - cached_count = sum(1 for r in results if r.get('success') and r.get('cached')) - download_count = success_count - cached_count - - print(f"\n🎉 智能下载完成统计:") - print(f"✅ 成功下载: {success_count}/{len(results)} 篇论文") - print(f"📋 缓存命中: {cached_count} 篇") - print(f"⬇️ 实际下载: {download_count} 篇") - print(f"⏱️ 总耗时: {total_time:.1f} 秒") - print(f"🔧 最终并发数: {self.current_concurrent}") - - if download_count > 0: - avg_time = total_time / download_count - print(f"📊 平均速度: {avg_time:.2f} 秒/篇") - - success_rate = success_count / len(results) if results else 0 - print(f"📈 成功率: {success_rate:.1%}") - - # 性能总结 - if success_rate >= 0.9: - print(f"🏆 下载性能: 优秀") - elif success_rate >= 0.8: - print(f"👍 下载性能: 良好") - elif success_rate >= 0.6: - print(f"⚠️ 下载性能: 一般") - else: - print(f"❌ 下载性能: 需要优化") \ No newline at end of file diff --git a/tests/unit/test_utils_cleanup_contracts.py b/tests/unit/test_utils_cleanup_contracts.py new file mode 100644 index 00000000..a90bffc1 --- /dev/null +++ b/tests/unit/test_utils_cleanup_contracts.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import ast +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[2] +UTILS_ROOT = ROOT / "src" / "paperbot" / "utils" +SCAN_ROOTS = ( + ROOT / "main.py", + ROOT / "src", + ROOT / "tests", + ROOT / "cli", +) +REMOVED_FILES = ( + "CCS-DOWN.py", + "acm_extractor.py", + "conference_downloader.py", + "conference_helpers.py", + "conference_parsers.py", + "conference_parsers_new.py", + "downloader - ccs.py", + "downloader_back.py", + "keyword_optimizer.py", + "smart_downloader.py", +) +REMOVED_MODULES = ( + 
"acm_extractor", + "conference_downloader", + "conference_helpers", + "conference_parsers", + "conference_parsers_new", + "downloader_back", + "keyword_optimizer", + "smart_downloader", +) + + +def _iter_python_files(): + for path in SCAN_ROOTS: + if path.is_file(): + yield path + continue + yield from sorted(path.rglob("*.py")) + + +def test_orphaned_utils_variants_are_removed() -> None: + for filename in REMOVED_FILES: + assert not (UTILS_ROOT / filename).exists(), filename + + +def test_repo_does_not_import_removed_utils_modules() -> None: + removed_imports = {f"paperbot.utils.{name}" for name in REMOVED_MODULES} + for path in _iter_python_files(): + tree = ast.parse(path.read_text(encoding="utf-8-sig"), filename=str(path)) + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + assert alias.name not in removed_imports, f"{path.relative_to(ROOT)}: import {alias.name}" + elif isinstance(node, ast.ImportFrom) and node.module: + assert node.module not in removed_imports, ( + f"{path.relative_to(ROOT)}: from {node.module} import ..." + ) + + +def test_utils_directory_has_no_backup_or_invalid_python_filenames() -> None: + invalid = [] + + for path in UTILS_ROOT.glob("*.py"): + name = path.name + if " " in name or name != name.lower(): + invalid.append(name) + if name.endswith("_back.py") or name.endswith("_new.py"): + invalid.append(name) + + assert invalid == []