Movatterモバイル変換


[0]ホーム

URL:


Skip to content
DEV Community
Log in Create account

DEV Community

drake
drake

Posted on

采集百度指数

"""百度指数爬虫 2025年3月"""importjsonfromtracebackimportformat_excimporttimeimportrequestsfromdatetimeimportdatetime,timedeltaimportnumpyasnpimportpandasaspdimportrandomfromrequests.exceptionsimportRequestExceptioncookies_dict={"BAIDUID_BFESS":"03C87F7E8DAB230EF3CF68E2E4CCB7AC:FG=1","Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc":"1740900295","HMACCOUNT":"70309D756AB7564A","ppfuid":"FOCoIC3q5fKa8fgJnwzbE0LGziLN3VHbX8wfShDP6RCsfXQp/69CStRUAcn/QmhIlFDxPrAc/s5tJmCocrihdwitHd04Lvs3Nfz26Zt2holplnIKVacidp8Sue4dMTyfg65BJnOFhn1HthtSiwtygiD7piS4vjG/W9dLb1VAdqNDdL9XRrl2Sg9NTB85NN+3O0V6uxgO+hV7+7wZFfXG0MSpuMmh7GsZ4C7fF/kTgmssH+sfZC32dB1R3HtMdot/48PoBcDFpTKpfBzr/OZicPkAszoKx6tIpFl6mGV2OCZLSjlTHjWa30fvbP8FZaaPM+RpBohGNhMcqCHhVhtXpVObaDCHgWJZH3ZrTGYHmi7XJB9z3y2o8Kqxep5XBCsugNOW5C73e/g54kuY4PKIS8TtlheGhftBTbUILzt33xSjQXz/gJEgSYx1vUQMipXdSecr9yhMSRLVoFktEC1isB71ZYfNRps7I7heVMQ6naCvK/S9Ff5RtLDcahg8QCqqP/JUZA7BRBFh68uqDQax10gfXgGxCNf3Sx8e4KXUBrqV/g3hEEf9luu8oPziRIwanIJY1XZupqPZgmfh8BLwT9YUuyc0u8RKTitzO23hSwGX7sI4U3M5cfLBwVX5m74NveYUNi7Li87S8ZbXy31eyxBDK4IiDGlt1VFsxDIz0RsVHZudegSJ4zYa95fLOW41HdqdlVsa4ORVPwaoYgWzWigT4KUSvejPWWbczD37o0JAMY0Xq/mt7JbC+fPJzgUfL+4+FMXDRk2cSv9vAGlESpYON8OX4n9+9Iiz1Xhbaw4n3CtUvSh71zjHSVbOXva7HJMc4xeuRg7bfpEY/vwboa87Mf4DRxb3AAPFSzwHIQsKUb2NhurFXPHTBQ0ZqOMmlY+ev7ywybLL8HzYMUKf7xXkuNYCZBWkNbmLJnCAaUcxvvi236pnhRAiCpqFQgkNJGbjymaHirV01jGyjdICWIu01rzx5KJW22MzZ0c8aSEaiiS5MGq2rHDxd+cheyqXoKDbFUOPsQE72/a0kEWC2KhuPKLM9/6dZ00isWP1M71YVK+GcriYXdSGsdTLua2Z4rsiMpSciOy0GtH0BDIaHROBNUIGus13vk3BD9zddjzj9ZJseUlzwEV+bscicwIjSCwQvM4e3xnzVzlld+zvYN0q7Yw+xx5u95PSoz+nO88s9TqjpS2CuGXeoK3JV0ZsrYL63KbB6FE0u0LGhMX2XqphVNhJG/707P2GcCYlcR4=","BDUSS":"ndtMC1BVXpUdFF6Z3NGTHVCZlRmTXJVSXN3WUZXSkpTUXg5NU1iZGRCNlJsZXRuSVFBQUFBJCQAAAAAAAAAAAEAAAAK5OqRyqLH6cPOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJEIxGeRCMRnV","SIGNIN_UC":"70a2711cf1d3d9b1a82d2f87d633bd8a04909129477oUCDVgDuFIWQq0I5Qh%2BRYVI211tn6l6RtVzEcHQHKEeV3UqjZrdEN2J58qMq3yI6SH4Pf5yaO3wsYp7rDl7owf8Vxw8nV7J6HDde92bS
fNLB%2BLYSDn8mcJUeJZ9DvByeHYEh5HZIEmBqjEW9Kp1nhY39kd0%2FMKxlLMEZywpREcfeqkBuNDImGT1swOphjUr0m7yoFRuoRONhZO0DhIUp8qMp%2BI%2BGZ9URB2%2FDv3g%2FwZ0nXnjrScjtkdIga7hBOF4Os4RsqXflHU7INYd10uoQ2Ecn99qPwcD5%2BuKJ7%2BtRR94%3D59476045678983651647832308115528","__cas__rn__":"490912947","__cas__st__212":"eb761014eef2d40b7c2bad261c6098e983f5ecc9a335b83a0f261f17f01cab78142cd91d640c0bff5197630b","__cas__id__212":"40927145","CPTK_212":"1776632285","CPID_212":"40927145","Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc":"1740900504","BDUSS_BFESS":"ndtMC1BVXpUdFF6Z3NGTHVCZlRmTXJVSXN3WUZXSkpTUXg5NU1iZGRCNlJsZXRuSVFBQUFBJCQAAAAAAAAAAAEAAAAK5OqRyqLH6cPOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJEIxGeRCMRnV","bdindexid":"jcohnli710phtu4po08tnl0o33","ab_sr":"1.0.1_MTk4MzQ0NWIxNmJjZGNiODQzM2I2OGU2MjY0M2ExODNhZmMwMjY4MjQ5YzJkM2IzZTZjNTc1ODA1ZDdkNmFhM2QyZDMyNWZkMWRmNGMyZmNjYjFiMTJiYzgwMzc5YzA5ZDVkM2U1M2ZiNTdkN2VlZWY0ODZiNmE1MTg3Y2YzZjVhYWU5ZDZhZmIyMGY2ZWQwMzM5ODM2ZTI3ODY5Nzk5ZQ==","RT":"z=1&dm=baidu.com&si=f3786353-627c-486d-b8e5-ea5d83dc0735&ss=m7rb5t0k&sl=6&tt=p5z&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=6czl"}credential={"cipherText":"1740888411355_1740901282164_aLrE9Za0dpKtlO3CQw1IR/Yz3hP8cGXzHI/2BnYqUk5XRMPS4pr5kfk3slC7+G60AS9KjhhlCPNuQnqbFhpZS9Z7MUVTxUeQ8XlgGhrmV+FapK3+nQuTdrm1pz8Jy5qhWO0pOhQyUqv/AR5RFI0hKsasKjMYDQfng+XPMhygTo0rCb3PLrFDflBQ1riNlJ7Bg8s6TfsE3OMaJPAQsjhaZlZO1bXUAhFIY0EMqIxq2DAkMVEatrHKmDbkb0f2NJw988jZkhDEZTAJ06iAXqSLbKnbF0bPCUIqaT/a5yeqr2KtCwbJYH4flHQSoThN40a6t/XiyTqUc1Mdds6w27Q/qOyR+nPe8978fEsEB3UssJ9LPc62xsjzLmY1x5qH7eA/j7eJAgbbWVvYW8H/4N3iaauKg0D1F8NqUHMGoGVpAQSj0/HLx5pUebCoFBVBnbA2kMYD8kvavD1WzPEMte2sp2uhlSGB4IIDMkqz13eaIsc=","cookie_BDUSS":cookies_dict['BDUSS']}defgenerate_http_headers(credential):http_headers={'Cookie':'BDUSS='+credential["cookie_BDUSS"],'Cipher-Text':credential["cipherText"],'Accept':'application/json, text/plain, */*','Accept-Encoding':'gzip, deflate, 
br','Accept-Language':'zh-CN,zh;q=0.9','Connection':'keep-alive','Referer':'https://index.baidu.com/v2/main/index.html','Host':'index.baidu.com','User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}returnhttp_headersdefcalculate_yearly_averages(start_date,end_date,data_series):# Convert the start and end dates to datetime objectsstart=datetime.strptime(start_date,'%Y-%m-%d')end=datetime.strptime(end_date,'%Y-%m-%d')days_span=(end-start).days+1# Split the data series into a list and replace empty strings with '0'data_points=data_series.split(',')data_points=['0'ifpoint==''elsepointforpointindata_points]data_points=np.array(data_points,dtype=float)ifdays_span<=366:dates=pd.date_range(start,periods=len(data_points))else:weeks_span=len(data_points)dates=pd.date_range(start,periods=weeks_span,freq='W')# Create a DataFrame with the dates and data pointsdf=pd.DataFrame({'Date':dates,'Data':data_points})df.set_index('Date',inplace=True)# Calculate the yearly averageyearly_averages=df.resample('YE').mean().reset_index()yearly_averages['Year']=yearly_averages['Date'].dt.yearyearly_averages.drop('Date',axis=1,inplace=True)yearly_averages.rename(columns={'Data':'Average'},inplace=True)# Convert DataFrame to list of tuples (year, average)yearly_averages_list=list(yearly_averages.itertuples(index=False,name=None))print(yearly_averages_list)returnyearly_averages_list# 解密defdecrypt(ptbk,index_data):n=len(ptbk)//2a=dict(zip(ptbk[:n],ptbk[n:]))return"".join([a[s]forsinindex_data])defkeywords2json(keyword):importjsonconverted_keywords=[[{"name":keyword,"wordType":1}]]# Convert the list of lists of dictionaries into a JSON stringjson_string=json.dumps(converted_keywords,ensure_ascii=False)print(json_string)returnjson_string## def namely(keywords):#     return 
'+'.join(keywords)defcrawl_request(keyword,startDate,endDate,regionCode,credential,expectedInterval,autoSave,regionName,data_combine):print('正在查询:',keyword,startDate,endDate,regionCode)words=keywords2json(keyword)# 第一级以逗号分隔,第二级以加号分隔testwordset=keywordmax_retries=3# 最大重试次数retries=0# 当前重试次数whileretries<max_retries:try:url=f'https://index.baidu.com/api/AddWordApi/checkWordsExists?word={testwordset}'rsp=requests.get(url,headers=generate_http_headers(credential),timeout=10).json()# 若data的result不为空,则说明关键词不存在,报错并退出ifrsp['data']['result']:print(f'{testwordset}关键词不存在或组合里有不存在的关键词,请检查')return-1url=f'http://index.baidu.com/api/SearchApi/index?area=0&word={words}&area={regionCode}&startDate={startDate}&endDate={endDate}'rsp=requests.get(url,headers=generate_http_headers(credential),timeout=10).json()# 获取解密秘钥data=rsp['data']['userIndexes']uniqid=rsp['data']['uniqid']url=f'https://index.baidu.com/Interface/ptbk?uniqid={uniqid}'ptbk=requests.get(url,headers=generate_http_headers(credential),timeout=10).json()['data']# 数据解密res=[0for_inrange(len(data))]foriinrange(len(data)):index_data=decrypt(ptbk,data[i]['all']['data'])yearly_averages=calculate_yearly_averages(startDate,endDate,index_data)fortuple_iteminyearly_averages:index_d=round(tuple_item[0],2)year=tuple_item[1]ifyear>2022:continueifyearindata_combine:data_combine[year].append(index_d)else:data_combine[year]=[year,regionName,index_d]returnresexceptExceptionase:print(f'请求失败,错误信息:{e}')retries+=1print(f'重试第{retries}次...')time.sleep(random.randint(1,3))# 在重试前等待一段时间ifretries==max_retries:print(f'请求失败次数过多,已达到最大重试次数{max_retries},跳过当前连接')return-1# regions = {}provinces={901:"山东",902:"贵州",903:"江西",904:"重庆",905:"内蒙古",906:"湖北",907:"辽宁",908:"湖南",909:"福建",910:"上海",911:"北京",912:"广西",913:"广东",914:"四川",915:"云南",916:"江苏",917:"浙江",918:"青海",919:"宁夏",920:"河北",921:"黑龙江",922:"吉林",923:"天津",924:"陕西",925:"甘肃",926:"新疆",927:"河南",928:"安徽",929:"山西",930:"海南",931:"台湾",# 932: 
"西藏",933:"香港",934:"澳门"}regions=provincesdefcrawl(regionCode,credential,expectedInterval,autoSave,regionName,data_combine):# 获取11年到22年的数据startDate='2011-01-01'endDate='2022-12-31'# 清洗关键词keywords=['第三方支付','在线支付','移动支付','网贷','互联网理财','互联网保险','在线理财','电子银行','网银','大数据','云计算','人工智能','区块链','生物识别']# res = {regionCode: []}forkeywordinkeywords:ifregionCode!='999':try:crawl_request(keyword,startDate,endDate,regionCode,credential,expectedInterval,autoSave,regionName,data_combine)except:print(format_exc())# res[regionCode].extend(t)# 每次查询后休息一到五秒,实际上在账号很多的情况下,这个时间可以缩短time.sleep(expectedInterval/1000+random.randint(1,3)/2)if__name__=='__main__':importcsv# # 清洗关键词# titles = ['年份', '区域', '第三方支付', '在线支付', '移动支付', '网贷', '互联网理财', '互联网保险', '在线理财', '电子银行', '网银',#             '大数据', '云计算', '人工智能',#             '区块链', '生物识别']# with open('combine_backup.csv', 'a', encoding='utf-8-sig', newline='') as csvfile:#     writer = csv.writer(csvfile)#     writer.writerow(titles)forregionCodeinregions:# regionCode = 928# regionName = '安徽'regionName=regions[regionCode]data_combine={}crawl(regionCode,credential,10,True,regionName,data_combine)data_list=[]foriindata_combine:data_list.append(data_combine[i])withopen('combine_backup.csv','a',encoding='utf-8-sig',newline='')ascsvfile:writer=csv.writer(csvfile)writer.writerows(data_list)
Enter fullscreen mode / Exit fullscreen mode

Top comments(0)

Subscribe
pic
Create template

Templates let you quickly answer FAQs or store snippets for re-use.

Dismiss

Are you sure you want to hide this comment? It will become hidden in your post, but will still be visible via the comment's permalink.

For further actions, you may consider blocking this person and/or reporting abuse.

生活唯一不可或缺的是学习
  • Location
    Dubai
  • Education
    master
  • Joined

More from drake

DEV Community

We're a place where coders share, stay up-to-date and grow their careers.

Log in Create account

[8]ページ先頭

©2009-2025 Movatter.jp