SmartPagerankSearch / crawler_v2 /sync_wrapper.py
GitHub Action
Sync from GitHub Actions (Clean Commit)
7f22d3c
"""
同步包装器 - 为了向后兼容,提供同步接口
"""
import asyncio
import logging
from typing import Optional, Dict
from .crawler import AsyncCrawler
logger = logging.getLogger(__name__)
class SyncCrawlerWrapper:
"""
同步爬虫包装器 - 包装AsyncCrawler以提供同步接口
兼容原有的SmartCrawler.parse()接口
"""
def __init__(self, **kwargs):
"""
初始化同步包装器
Args:
**kwargs: 传递给AsyncCrawler的参数
"""
self.async_crawler = AsyncCrawler(**kwargs)
self._loop = None
def parse(self, url: str) -> Optional[Dict]:
"""
同步接口 - 兼容SmartCrawler.parse()
Args:
url: 要爬取的URL
Returns:
爬取结果字典,格式:{"url": str, "texts": List[str], "images": List[str], "links": List[str]}
如果失败则返回None
"""
logger.debug(f"SyncCrawlerWrapper.parse() called for: {url}")
try:
# 检查是否在已有事件循环的上下文中
try:
loop = asyncio.get_running_loop()
logger.debug(f"Running in existing event loop context, using thread pool")
# 如果有运行中的循环,在新线程中运行
import concurrent.futures
def run_in_new_loop():
"""在新的事件循环中运行(在新线程中)"""
logger.debug(f"Creating new event loop in thread")
new_loop = asyncio.new_event_loop()
asyncio.set_event_loop(new_loop)
try:
logger.debug(f"Running async crawler for: {url}")
results = new_loop.run_until_complete(self.async_crawler.run([url]))
logger.debug(f"Async crawler completed, got {len(results) if results else 0} results")
return results
finally:
logger.debug(f"Closing event loop")
new_loop.close()
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(run_in_new_loop)
results = future.result(timeout=120) # 120秒超时
return results[0] if results else None
except RuntimeError:
# 没有运行中的循环,直接运行
logger.debug(f"No running event loop, using asyncio.run()")
results = asyncio.run(self.async_crawler.run([url]))
return results[0] if results else None
except Exception as e:
logger.error(f"Error in parse({url}): {e}")
import traceback
traceback.print_exc()
return None
def __getattr__(self, name):
"""代理其他方法到异步爬虫"""
return getattr(self.async_crawler, name)
def __del__(self):
"""清理资源"""
try:
# 尝试关闭异步爬虫
if hasattr(self.async_crawler, 'executor'):
self.async_crawler.executor.shutdown(wait=False)
except Exception:
pass