"""
同步包装器 - 为了向后兼容,提供同步接口
"""
import asyncio
import logging
from typing import Optional, Dict
from .crawler import AsyncCrawler

logger = logging.getLogger(__name__)


class SyncCrawlerWrapper:
    """
    同步爬虫包装器 - 包装AsyncCrawler以提供同步接口
    兼容原有的SmartCrawler.parse()接口
    """
    
    def __init__(self, **kwargs):
        """
        初始化同步包装器
        
        Args:
            **kwargs: 传递给AsyncCrawler的参数
        """
        self.async_crawler = AsyncCrawler(**kwargs)
        self._loop = None  # Not used by the current implementation
    
    def parse(self, url: str) -> Optional[Dict]:
        """
        同步接口 - 兼容SmartCrawler.parse()
        
        Args:
            url: 要爬取的URL
            
        Returns:
            爬取结果字典,格式:{"url": str, "texts": List[str], "images": List[str], "links": List[str]}
            如果失败则返回None
        """
        logger.debug(f"SyncCrawlerWrapper.parse() called for: {url}")
        try:
            # Check whether we are already inside a running event loop
            try:
                asyncio.get_running_loop()
                logger.debug("Running in existing event loop context, using thread pool")
                # A loop is already running, so execute the crawl in a separate thread
                import concurrent.futures
                
                def run_in_new_loop():
                    """在新的事件循环中运行(在新线程中)"""
                    logger.debug(f"Creating new event loop in thread")
                    new_loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(new_loop)
                    try:
                        logger.debug(f"Running async crawler for: {url}")
                        results = new_loop.run_until_complete(self.async_crawler.run([url]))
                        logger.debug(f"Async crawler completed, got {len(results) if results else 0} results")
                        return results
                    finally:
                        logger.debug(f"Closing event loop")
                        new_loop.close()
                
                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(run_in_new_loop)
                    results = future.result(timeout=120)  # 120-second timeout
                    return results[0] if results else None
                    
            except RuntimeError:
                # No running event loop, so it is safe to use asyncio.run() directly
                logger.debug("No running event loop, using asyncio.run()")
                results = asyncio.run(self.async_crawler.run([url]))
                return results[0] if results else None
                
        except Exception as e:
            logger.error(f"Error in parse({url}): {e}")
            import traceback
            traceback.print_exc()
            return None
    
    def __getattr__(self, name):
        """代理其他方法到异步爬虫"""
        return getattr(self.async_crawler, name)
    
    def __del__(self):
        """清理资源"""
        try:
            # Try to shut down the async crawler's executor
            if hasattr(self.async_crawler, 'executor'):
                self.async_crawler.executor.shutdown(wait=False)
        except Exception:
            pass
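

# A minimal usage sketch, assuming the enclosing package is importable (run it as
# `python -m <package>.sync_wrapper` because of the relative import above) and that
# AsyncCrawler's default constructor arguments are sufficient; the URL below is
# purely illustrative.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    crawler = SyncCrawlerWrapper()  # **kwargs would be forwarded to AsyncCrawler
    result = crawler.parse("https://example.com")  # hypothetical URL

    if result is None:
        print("Crawl failed")
    else:
        print(f"Crawled {result['url']}: "
              f"{len(result['texts'])} text blocks, "
              f"{len(result['images'])} images, "
              f"{len(result['links'])} links")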