Commit 6b2a802

1. Remove eBay search (not supported)
2. More robust browser manager
3. More responsive Ctrl+C exit

1 parent 3da57dd · commit 6b2a802

10 files changed, +205 -184 lines changed

‎core/async_database.py‎

Lines changed: 1 addition & 1 deletion
@@ -446,7 +446,7 @@ def _get_utc_timestamp(self) -> str:
     async def cache_url(self, result: CrawlResult):
         """Cache URL data - optimized version"""
         # if there is a redirected URL, use it as the primary key
-        cache_url = result.redirected_url if result.redirected_url else result.url
+        cache_url = result.url
         if not cache_url:
             return

‎core/general_process.py‎

Lines changed: 1 addition & 1 deletion
@@ -219,7 +219,7 @@ async def wrapper():
             sources.append({'type': KUAISHOU_PLATFORM_NAME, 'query': {focuspoint}})
         elif search_source == 'github':
             tasks.add(wrap_task(search_with_github(focuspoint, existings['web']), ('posts', 'github')))
-        elif search_source in ['ebay', 'bing', 'arxiv'] and crawlers.get('web'):
+        elif search_source in ['bing', 'arxiv'] and crawlers.get('web'):
             tasks.add(wrap_task(search_with_engine(search_source, focuspoint, crawlers['web'], existings['web']),
                                 ('article_or_posts', search_source)))
         else:

‎core/run_task.py‎

Lines changed: 107 additions & 53 deletions
@@ -16,75 +16,129 @@
 )
 
 import asyncio
+import signal
+import sys
 from general_process import main_process
 from async_logger import wis_logger
 from async_database import AsyncDatabaseManager
 from custom_processes import crawler_config_map
 
 loop_counter = 0
+shutdown_event = asyncio.Event()
+
+def signal_handler(sig, frame):
+    """Handle the SIGINT signal (Ctrl+C)"""
+    wis_logger.debug(f"Received signal {sig}, initiating graceful shutdown...")
+    shutdown_event.set()
+
+async def cleanup_resources(crawlers, db_manager):
+    """Clean up all resources"""
+    wis_logger.debug("Starting resource cleanup...")
+
+    try:
+        # clean up the web crawler (browser resources)
+        if "web" in crawlers:
+            wis_logger.debug("Closing web crawler...")
+            await crawlers["web"].close()
+
+        # clean up the database
+        wis_logger.debug("Cleaning up database...")
+        await db_manager.cleanup()
+        wis_logger.debug("Resource cleanup completed successfully")
+
+    except Exception as e:
+        wis_logger.warning(f"Error during resource cleanup: {e}")
 
 async def schedule_task():
+    # set up the signal handlers
+    if sys.platform != 'win32':
+        signal.signal(signal.SIGINT, signal_handler)
+        signal.signal(signal.SIGTERM, signal_handler)
+
     # initialize if any error, will raise exception
     db_manager = AsyncDatabaseManager()
     await db_manager.initialize()
     crawlers = {}
-    for platform in ALL_PLATFORMS:
-        if platform == KUAISHOU_PLATFORM_NAME:
-            try:
-                ks_crawler = KuaiShouCrawler(db_manager=db_manager)
-                await ks_crawler.async_initialize()
-                crawlers[KUAISHOU_PLATFORM_NAME] = ks_crawler
-            except Exception as e:
-                wis_logger.warning(f"initialize kuaishou crawler failed: {e}, will abort all the sources for kuaishou platform")
-        elif platform == WEIBO_PLATFORM_NAME:
-            try:
-                wb_crawler = WeiboCrawler(db_manager=db_manager)
-                await wb_crawler.async_initialize()
-                crawlers[WEIBO_PLATFORM_NAME] = wb_crawler
-            except Exception as e:
-                wis_logger.warning(f"initialize weibo crawler failed: {e}, will abort all the sources for weibo platform")
-        elif platform == 'web':
+
+    try:
+        for platform in ALL_PLATFORMS:
+            if platform == KUAISHOU_PLATFORM_NAME:
+                try:
+                    ks_crawler = KuaiShouCrawler(db_manager=db_manager)
+                    await ks_crawler.async_initialize()
+                    crawlers[KUAISHOU_PLATFORM_NAME] = ks_crawler
+                except Exception as e:
+                    wis_logger.warning(f"initialize kuaishou crawler failed: {e}, will abort all the sources for kuaishou platform")
+            elif platform == WEIBO_PLATFORM_NAME:
+                try:
+                    wb_crawler = WeiboCrawler(db_manager=db_manager)
+                    await wb_crawler.async_initialize()
+                    crawlers[WEIBO_PLATFORM_NAME] = wb_crawler
+                except Exception as e:
+                    wis_logger.warning(f"initialize weibo crawler failed: {e}, will abort all the sources for weibo platform")
+            elif platform == 'web':
+                try:
+                    web_crawler = AsyncWebCrawler(crawler_config_map=crawler_config_map, db_manager=db_manager)
+                    await web_crawler.start()
+                    crawlers[platform] = web_crawler
+                except Exception as e:
+                    wis_logger.warning(f"initialize web crawler failed: {e}, will abort all the sources for web platform and search engines")
+            else:
+                raise ValueError(f"platform {platform} not supported")
+
+        global loop_counter
+        wis_logger.info("All crawlers initialized successfully, starting main loop...")
+
+        while not shutdown_event.is_set():
             try:
-                web_crawler = AsyncWebCrawler(crawler_config_map=crawler_config_map, db_manager=db_manager)
-                await web_crawler.start()
-                crawlers[platform] = web_crawler
+                wis_logger.info(f'task execute loop {loop_counter + 1}')
+                tasks = await db_manager.get_activated_focus_points_with_sources()
+                jobs = []
+                for task in tasks:
+                    focus = task['focus_point']
+                    sources = task['sources']
+                    if not focus:
+                        continue
+                    if not focus['freq'] or not focus['focuspoint']:
+                        continue
+                    if loop_counter % focus['freq'] != 0:
+                        continue
+                    jobs.append(main_process(focus, sources, crawlers, db_manager))
+                loop_counter += 1
+
+                if jobs:
+                    await asyncio.gather(*jobs)
+
+                wis_logger.info('task execute loop finished, work after 3600 seconds')
+
+                # use wait_for so the sleep can be interrupted
+                try:
+                    await asyncio.wait_for(shutdown_event.wait(), timeout=3600)
+                    break  # exit the loop if shutdown_event was set
+                except asyncio.TimeoutError:
+                    continue  # timeout reached, start the next loop
+
+            except asyncio.CancelledError:
+                wis_logger.debug("Task cancelled, shutting down...")
+                break
+            except KeyboardInterrupt:
+                wis_logger.debug("Received keyboard interrupt, shutting down...")
+                break
             except Exception as e:
-                wis_logger.warning(f"initialize web crawler failed: {e}, will abort all the sources for web platform and search engines")
-        else:
-            raise ValueError(f"platform {platform} not supported")
+                wis_logger.warning(f"Unexpected error in main loop: {e}")
+                # do not exit the loop, keep processing
+
+    except Exception as e:
+        wis_logger.warning(f"Critical error during initialization: {e}")
+    finally:
+        await cleanup_resources(crawlers, db_manager)
 
-    global loop_counter
+if __name__ == "__main__":
     try:
-        while True:
-            wis_logger.info(f'task execute loop {loop_counter + 1}')
-            tasks = await db_manager.get_activated_focus_points_with_sources()
-            jobs = []
-            for task in tasks:
-                focus = task['focus_point']
-                sources = task['sources']
-                if not focus:
-                    continue
-                if not focus['freq'] or not focus['focuspoint']:
-                    continue
-                if loop_counter % focus['freq'] != 0:
-                    continue
-                jobs.append(main_process(focus, sources, crawlers, db_manager))
-            loop_counter += 1
-            await asyncio.gather(*jobs)
-            wis_logger.info('task execute loop finished, work after 3600 seconds')
-            await asyncio.sleep(3600)
+        asyncio.run(schedule_task())
     except KeyboardInterrupt:
-        wis_logger.info("Received interrupt signal, shutting down...")
+        wis_logger.debug("Program interrupted by user")
    except Exception as e:
-        wis_logger.error(f"Unexpected error in main loop: {e}")
+        wis_logger.warning(f"Program failed with error: {e}")
    finally:
-        # clean up database resources
-        try:
-            if "web" in crawlers:
-                await crawlers["web"].close()
-            await db_manager.cleanup()
-            wis_logger.debug("Database cleanup completed")
-        except Exception as e:
-            wis_logger.error(f"Database cleanup failed: {e}")
-
-asyncio.run(schedule_task())
+        wis_logger.debug("Program shutdown complete")

‎core/wis/async_webcrawler.py‎

Lines changed: 5 additions & 1 deletion
@@ -113,7 +113,11 @@ async def nullcontext(self):
 
     async def arun(self, url: str, config: CrawlerRunConfig = None, session_id: str = None) -> Optional[RunManyReturn]:
         if self.db_manager:
-            cached_result = await self.db_manager.get_cached_url(url, days_threshold=30)
+            if url.startswith("https://www.bing.com/search"):
+                days_threshold = 1
+            else:
+                days_threshold = 30
+            cached_result = await self.db_manager.get_cached_url(url, days_threshold=days_threshold)
             if cached_result and cached_result.html:
                 wis_logger.debug(f"Get {url} from db cache")
                 cached_result.session_id = session_id
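With this change a Bing search-results page is considered stale after one day, while any other cached page stays valid for 30 days. The same selection logic as a tiny standalone helper (cache_days_for is an illustrative name, not part of the repository):

def cache_days_for(url: str) -> int:
    # search result pages go stale quickly; ordinary pages can be cached longer
    if url.startswith("https://www.bing.com/search"):
        return 1
    return 30

assert cache_days_for("https://www.bing.com/search?q=wiseflow") == 1
assert cache_days_for("https://example.com/article") == 30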

‎core/wis/browser_manager.py‎

Lines changed: 9 additions & 7 deletions
@@ -315,15 +315,17 @@ def _cleanup_expired_sessions(self):
                 asyncio.create_task(self.kill_session(sid))
 
     async def close(self):
-        """Close all browser resources and clean up."""
         if self.config.sleep_on_close:
             await asyncio.sleep(0.5)
 
-        for context in self.contexts.values():
-            await context.close()
+        if self.playwright:
+            try:
+                await self.playwright.stop()
+            except Exception as e:
+                if self.logger:
+                    self.logger.warning(f"Error stopping playwright during cleanup: {e}")
+            finally:
+                self.playwright = None
+
         self.contexts.clear()
         self.sessions.clear()
-
-        if self.playwright:
-            await self.playwright.stop()
-            self.playwright = None

‎core/wis/mc_commen/tools/crawler_util.py‎

Lines changed: 0 additions & 41 deletions
@@ -27,47 +27,6 @@ def show_qrcode(qr_code) -> None:  # type: ignore
     draw.rectangle((0, 0, width + 19, height + 19), outline=(0, 0, 0), width=1)
     new_image.show()
 
-
-def get_user_agent() -> str:
-    ua_list = [
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.5112.79 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.5060.53 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.4844.84 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5112.79 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5060.53 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.4844.84 Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
-        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.5112.79 Safari/537.36"
-    ]
-    return random.choice(ua_list)
-
-
-def get_mobile_user_agent() -> str:
-    ua_list = [
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.99 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (iPad; CPU OS 16_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/114.0.5735.124 Mobile/15E148 Safari/604.1",
-        "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36",
-        "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/110.0.5481.154 Mobile Safari/537.36",
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 OPR/99.0.0.0",
-        "Mozilla/5.0 (Linux; Android 10; JNY-LX1; HMSCore 6.11.0.302) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.88 HuaweiBrowser/13.0.5.303 Mobile Safari/537.36"
-    ]
-    return random.choice(ua_list)
-
-
 def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
     cookie_dict: Dict[str, str] = dict()
     if not cookie_str:

‎core/wis/searchengines/__init__.py‎

Lines changed: 10 additions & 28 deletions
@@ -46,10 +46,11 @@ async def search_with_engine(engine: str,
         request_params = engine_module.gen_request_params(query, **kwargs)
         method = request_params.get("method", "GET").upper()
         url = request_params["url"]
+        headers = request_params.get("headers", {})
         async with httpx.AsyncClient(timeout=60) as client:
             for attempt in range(3):
                 try:
-                    response = await client.request(method, url)
+                    response = await client.request(method, url, headers=headers)
                     response.raise_for_status
                     break
                 except Exception as e:
@@ -65,14 +66,15 @@ async def search_with_engine(engine: str,
                     )
                    return [], "", {}
         html = response.text
-    else:
+
+    elif engine == "bing":
         url = engine_module.gen_query_url(query, **kwargs)
         result = await crawler.arun(url)
         if not result or not result.success:
             wis_logger.warning(f"Search with Engine '{engine}', query '{query}', due to crawler, failed")
             return [], "", {}
         html = result.html
-
+
     try:
         search_results = engine_module.parse_response(html)
     except Exception as e:
@@ -83,45 +85,25 @@ async def search_with_engine(engine: str,
     markdown = ""
     link_dict = {}
     for result in search_results:
+        link_url = result.get("url")
+        if not link_url or link_url in existings:
+            continue
         if engine == "bing":
             title = result.get("title", "")
             content = result.get("content", "")
             if not title and not content:
                 continue
             key = f"[{len(link_dict) + 1}]"
-            link_dict[key] = url
+            link_dict[key] = link_url
             markdown += f"* {key} {title}\n"
             if content:
                 content = content.replace("\n", " ")
                 markdown += f"{content} {key}\n"
             markdown += "\n"
-
-        if engine == "ebay":
-            # for ebay engine, we treat the result as post list, need to generate the markdown and link_dict
-            url = result.get("url")
-            if not url or url in existings:
-                continue
-            title = result.get("title", "") or ""
-            title = title.replace("\n", " ")
-            content = result.get("content", "") or ""
-            content = content.replace("\n", " ")
-            # test code
-            if content:
-                wis_logger.warning(f'[UNEXPECTED] ebay {url} have content! {content}\nfrom query: {query}')
-            price = result.get("price", "")
-            shipping = result.get("shipping", "")
-            source_country = result.get("source_country", "")
-
-            key = f"[{len(link_dict) + 1}]"
-            link_dict[key] = url
-            markdown += f"* {key} {title}\nPrice: {price} Shipping: {shipping}\nSource Country: {source_country} {key}\n\n"
 
         elif engine == "arxiv":
             # for arxiv engine, we have to treat the result as an article, because the url in the result is just the summary page
             # user should only use wiseflow as an information collector to find the potiencial interesting articles, and then use the url to get the full pdf
-            url = result.get("url")
-            if not url or url in existings:
-                continue
             title = result.get("title", "")
             content = result.get("content", "") or ""
             content = content.replace("\n", " ")
@@ -142,7 +124,7 @@ async def search_with_engine(engine: str,
                 comments = comments.replace("\n", " ")
                 _markdown += f"Comments: {comments}\n"
             _markdown += content
-            articles.append(CrawlResult(url=url,
+            articles.append(CrawlResult(url=link_url,
                                         title=title,
                                         markdown=_markdown,
                                         author=authors,
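With the eBay branch gone, search_with_engine only builds results for the bing and arxiv engines (plus whatever the HTTP-request branch handles). A hedged usage sketch based on the call site visible in the general_process.py diff above; the three-part return value is inferred from the return [], "", {} fallbacks and may not match the exact names used in the repository:

async def run_engine_search(focuspoint, crawlers, existings):
    # mirrors the call in general_process.py; "bing" could equally be "arxiv"
    results, markdown, link_dict = await search_with_engine(
        "bing", focuspoint, crawlers['web'], existings['web']
    )
    return results, markdown, link_dict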
