drake
Posted on
采集百度指数
"""Baidu Index crawler (March 2025).

For every region in ``regions`` the script queries the Baidu Index API for a
fixed list of keywords over 2011-2022, averages the decrypted index series per
calendar year and appends one CSV row per (year, region) to
``combine_backup.csv``.
"""
import json
import os
import random
import time
from datetime import datetime, timedelta
from traceback import format_exc

import numpy as np
import pandas as pd
import requests
from requests.exceptions import RequestException

# NOTE(review): these are session cookies of a real, logged-in account that
# were pasted into the source. They expire and should be rotated; prefer the
# environment overrides read when ``credential`` is built below.
# Only the "BDUSS" entry is read by this script.
cookies_dict = {
    "BAIDUID_BFESS": "03C87F7E8DAB230EF3CF68E2E4CCB7AC:FG=1",
    "Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc": "1740900295",
    "HMACCOUNT": "70309D756AB7564A",
    "ppfuid": "FOCoIC3q5fKa8fgJnwzbE0LGziLN3VHbX8wfShDP6RCsfXQp/69CStRUAcn/QmhIlFDxPrAc/s5tJmCocrihdwitHd04Lvs3Nfz26Zt2holplnIKVacidp8Sue4dMTyfg65BJnOFhn1HthtSiwtygiD7piS4vjG/W9dLb1VAdqNDdL9XRrl2Sg9NTB85NN+3O0V6uxgO+hV7+7wZFfXG0MSpuMmh7GsZ4C7fF/kTgmssH+sfZC32dB1R3HtMdot/48PoBcDFpTKpfBzr/OZicPkAszoKx6tIpFl6mGV2OCZLSjlTHjWa30fvbP8FZaaPM+RpBohGNhMcqCHhVhtXpVObaDCHgWJZH3ZrTGYHmi7XJB9z3y2o8Kqxep5XBCsugNOW5C73e/g54kuY4PKIS8TtlheGhftBTbUILzt33xSjQXz/gJEgSYx1vUQMipXdSecr9yhMSRLVoFktEC1isB71ZYfNRps7I7heVMQ6naCvK/S9Ff5RtLDcahg8QCqqP/JUZA7BRBFh68uqDQax10gfXgGxCNf3Sx8e4KXUBrqV/g3hEEf9luu8oPziRIwanIJY1XZupqPZgmfh8BLwT9YUuyc0u8RKTitzO23hSwGX7sI4U3M5cfLBwVX5m74NveYUNi7Li87S8ZbXy31eyxBDK4IiDGlt1VFsxDIz0RsVHZudegSJ4zYa95fLOW41HdqdlVsa4ORVPwaoYgWzWigT4KUSvejPWWbczD37o0JAMY0Xq/mt7JbC+fPJzgUfL+4+FMXDRk2cSv9vAGlESpYON8OX4n9+9Iiz1Xhbaw4n3CtUvSh71zjHSVbOXva7HJMc4xeuRg7bfpEY/vwboa87Mf4DRxb3AAPFSzwHIQsKUb2NhurFXPHTBQ0ZqOMmlY+ev7ywybLL8HzYMUKf7xXkuNYCZBWkNbmLJnCAaUcxvvi236pnhRAiCpqFQgkNJGbjymaHirV01jGyjdICWIu01rzx5KJW22MzZ0c8aSEaiiS5MGq2rHDxd+cheyqXoKDbFUOPsQE72/a0kEWC2KhuPKLM9/6dZ00isWP1M71YVK+GcriYXdSGsdTLua2Z4rsiMpSciOy0GtH0BDIaHROBNUIGus13vk3BD9zddjzj9ZJseUlzwEV+bscicwIjSCwQvM4e3xnzVzlld+zvYN0q7Yw+xx5u95PSoz+nO88s9TqjpS2CuGXeoK3JV0ZsrYL63KbB6FE0u0LGhMX2XqphVNhJG/707P2GcCYlcR4=",
    "BDUSS": "ndtMC1BVXpUdFF6Z3NGTHVCZlRmTXJVSXN3WUZXSkpTUXg5NU1iZGRCNlJsZXRuSVFBQUFBJCQAAAAAAAAAAAEAAAAK5OqRyqLH6cPOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJEIxGeRCMRnV",
    "SIGNIN_UC": (
        "70a2711cf1d3d9b1a82d2f87d633bd8a04909129477oUCDVgDuFIWQq0I5Qh%2BRYVI211tn6l6RtVzEcHQHKEeV3UqjZrdEN2J58qMq3yI6SH4Pf5yaO3wsYp7rDl7owf8Vxw8nV7J6HDde92bS"
        "fNLB%2BLYSDn8mcJUeJZ9DvByeHYEh5HZIEmBqjEW9Kp1nhY39kd0%2FMKxlLMEZywpREcfeqkBuNDImGT1swOphjUr0m7yoFRuoRONhZO0DhIUp8qMp%2BI%2BGZ9URB2%2FDv3g%2FwZ0nXnjrScjtkdIga7hBOF4Os4RsqXflHU7INYd10uoQ2Ecn99qPwcD5%2BuKJ7%2BtRR94%3D59476045678983651647832308115528"
    ),
    "__cas__rn__": "490912947",
    "__cas__st__212": "eb761014eef2d40b7c2bad261c6098e983f5ecc9a335b83a0f261f17f01cab78142cd91d640c0bff5197630b",
    "__cas__id__212": "40927145",
    "CPTK_212": "1776632285",
    "CPID_212": "40927145",
    "Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc": "1740900504",
    "BDUSS_BFESS": "ndtMC1BVXpUdFF6Z3NGTHVCZlRmTXJVSXN3WUZXSkpTUXg5NU1iZGRCNlJsZXRuSVFBQUFBJCQAAAAAAAAAAAEAAAAK5OqRyqLH6cPOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAJEIxGeRCMRnV",
    "bdindexid": "jcohnli710phtu4po08tnl0o33",
    "ab_sr": "1.0.1_MTk4MzQ0NWIxNmJjZGNiODQzM2I2OGU2MjY0M2ExODNhZmMwMjY4MjQ5YzJkM2IzZTZjNTc1ODA1ZDdkNmFhM2QyZDMyNWZkMWRmNGMyZmNjYjFiMTJiYzgwMzc5YzA5ZDVkM2U1M2ZiNTdkN2VlZWY0ODZiNmE1MTg3Y2YzZjVhYWU5ZDZhZmIyMGY2ZWQwMzM5ODM2ZTI3ODY5Nzk5ZQ==",
    "RT": "z=1&dm=baidu.com&si=f3786353-627c-486d-b8e5-ea5d83dc0735&ss=m7rb5t0k&sl=6&tt=p5z&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=6czl",
}

# Values sent with every request (see generate_http_headers). Setting the
# environment variables BAIDU_INDEX_CIPHER_TEXT / BAIDU_INDEX_BDUSS overrides
# the literals without editing this file; unset, behaviour is unchanged.
credential = {
    "cipherText": os.environ.get(
        "BAIDU_INDEX_CIPHER_TEXT",
        "1740888411355_1740901282164_aLrE9Za0dpKtlO3CQw1IR/Yz3hP8cGXzHI/2BnYqUk5XRMPS4pr5kfk3slC7+G60AS9KjhhlCPNuQnqbFhpZS9Z7MUVTxUeQ8XlgGhrmV+FapK3+nQuTdrm1pz8Jy5qhWO0pOhQyUqv/AR5RFI0hKsasKjMYDQfng+XPMhygTo0rCb3PLrFDflBQ1riNlJ7Bg8s6TfsE3OMaJPAQsjhaZlZO1bXUAhFIY0EMqIxq2DAkMVEatrHKmDbkb0f2NJw988jZkhDEZTAJ06iAXqSLbKnbF0bPCUIqaT/a5yeqr2KtCwbJYH4flHQSoThN40a6t/XiyTqUc1Mdds6w27Q/qOyR+nPe8978fEsEB3UssJ9LPc62xsjzLmY1x5qH7eA/j7eJAgbbWVvYW8H/4N3iaauKg0D1F8NqUHMGoGVpAQSj0/HLx5pUebCoFBVBnbA2kMYD8kvavD1WzPEMte2sp2uhlSGB4IIDMkqz13eaIsc=",
    ),
    "cookie_BDUSS": os.environ.get("BAIDU_INDEX_BDUSS", cookies_dict['BDUSS']),
}


def generate_http_headers(credential):
    """Build the request headers for index.baidu.com.

    Args:
        credential: mapping with the keys ``"cookie_BDUSS"`` and
            ``"cipherText"``.

    Returns:
        A new dict of HTTP headers carrying the BDUSS cookie and Cipher-Text.
    """
    # NOTE(review): 'br' is advertised in Accept-Encoding; confirm the runtime
    # can decode Brotli responses if the server ever sends them.
    http_headers = {
        'Cookie': 'BDUSS=' + credential["cookie_BDUSS"],
        'Cipher-Text': credential["cipherText"],
        'Accept': 'application/json, text/plain, */*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Referer': 'https://index.baidu.com/v2/main/index.html',
        'Host': 'index.baidu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    }
    return http_headers


def calculate_yearly_averages(start_date, end_date, data_series):
    """Average a comma-separated index series per calendar year.

    Args:
        start_date: first day of the series, ``'YYYY-MM-DD'``.
        end_date: last day of the series, ``'YYYY-MM-DD'``.
        data_series: comma-separated numbers; empty fields count as 0.

    Returns:
        A list of ``(average, year)`` tuples, one per calendar year covered
        by the generated dates. Note the order: the average comes first.
    """
    start = datetime.strptime(start_date, '%Y-%m-%d')
    end = datetime.strptime(end_date, '%Y-%m-%d')
    days_span = (end - start).days + 1

    # Empty fields are treated as a zero reading.
    data_points = np.array(
        ['0' if point == '' else point for point in data_series.split(',')],
        dtype=float,
    )

    # Spans of at most 366 days are dated one point per day; longer spans are
    # dated one point per week ('W' frequency).
    if days_span <= 366:
        dates = pd.date_range(start, periods=len(data_points))
    else:
        dates = pd.date_range(start, periods=len(data_points), freq='W')

    df = pd.DataFrame({'Date': dates, 'Data': data_points})
    df.set_index('Date', inplace=True)

    # 'YE' = year-end bins; the mean of each bin is that year's average.
    yearly_averages = df.resample('YE').mean().reset_index()
    yearly_averages['Year'] = yearly_averages['Date'].dt.year
    yearly_averages.drop('Date', axis=1, inplace=True)
    yearly_averages.rename(columns={'Data': 'Average'}, inplace=True)

    # Remaining column order is ['Average', 'Year'], hence (average, year).
    yearly_averages_list = list(yearly_averages.itertuples(index=False, name=None))
    print(yearly_averages_list)
    return yearly_averages_list


def decrypt(ptbk, index_data):
    """Decode *index_data* with the substitution key *ptbk*.

    The first half of *ptbk* holds the cipher characters and the second half
    the plain characters they map to, position by position.

    Raises:
        KeyError: if *index_data* contains a character missing from the key.
    """
    n = len(ptbk) // 2
    mapping = dict(zip(ptbk[:n], ptbk[n:]))
    return "".join(mapping[s] for s in index_data)


def keywords2json(keyword):
    """Return the JSON ``word`` parameter for a single keyword.

    The value is a list containing one list with one
    ``{"name": keyword, "wordType": 1}`` object. It is also printed.
    """
    converted_keywords = [[{"name": keyword, "wordType": 1}]]
    # ensure_ascii=False keeps non-ASCII keywords readable in the output.
    json_string = json.dumps(converted_keywords, ensure_ascii=False)
    print(json_string)
    return json_string


def crawl_request(keyword, startDate, endDate, regionCode, credential, expectedInterval, autoSave, regionName,
                  data_combine):
    """Fetch one keyword for one region and merge yearly averages in place.

    Args:
        keyword: keyword to query.
        startDate: first day, ``'YYYY-MM-DD'``.
        endDate: last day, ``'YYYY-MM-DD'``; years after it are discarded.
        regionCode: region code placed in the ``area`` query parameter.
        credential: mapping accepted by ``generate_http_headers``.
        expectedInterval: unused here; kept for interface compatibility.
        autoSave: unused here; kept for interface compatibility.
        regionName: region label stored in each newly created row.
        data_combine: dict ``year -> [year, regionName, value, ...]`` that is
            updated in place; one value per year is appended for this keyword.

    Returns:
        A list of zeros with one entry per returned index series on success,
        or ``-1`` if the keyword does not exist or every attempt failed.
    """
    print('正在查询:', keyword, startDate, endDate, regionCode)
    words = keywords2json(keyword)
    testwordset = keyword
    max_retries = 3

    for attempt in range(1, max_retries + 1):
        try:
            url = f'https://index.baidu.com/api/AddWordApi/checkWordsExists?word={testwordset}'
            rsp = requests.get(url, headers=generate_http_headers(credential), timeout=10).json()
            # A non-empty result means the keyword is not known to the service.
            if rsp['data']['result']:
                print(f'{testwordset}关键词不存在或组合里有不存在的关键词,请检查')
                return -1

            # HTTPS so the BDUSS cookie is never sent in clear text.
            url = f'https://index.baidu.com/api/SearchApi/index?area=0&word={words}&area={regionCode}&startDate={startDate}&endDate={endDate}'
            rsp = requests.get(url, headers=generate_http_headers(credential), timeout=10).json()
            data = rsp['data']['userIndexes']
            uniqid = rsp['data']['uniqid']

            # The decryption key is fetched per response via its uniqid.
            url = f'https://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
            ptbk = requests.get(url, headers=generate_http_headers(credential), timeout=10).json()['data']

            last_year = datetime.strptime(endDate, '%Y-%m-%d').year
            # Compute everything before touching data_combine so that a
            # failure followed by a retry can never append values twice.
            per_series = [
                calculate_yearly_averages(startDate, endDate, decrypt(ptbk, entry['all']['data']))
                for entry in data
            ]
        except Exception as e:
            # Retry boundary: network, JSON-shape and decoding errors are all
            # treated as a failed attempt.
            print(f'请求失败,错误信息:{e}')
            if attempt < max_retries:
                print(f'重试第{attempt}次...')
                time.sleep(random.randint(1, 3))  # back off before retrying
            continue

        for yearly_averages in per_series:
            for average, year in yearly_averages:
                if year > last_year:
                    continue
                index_d = round(average, 2)
                if year in data_combine:
                    data_combine[year].append(index_d)
                else:
                    data_combine[year] = [year, regionName, index_d]
        return [0] * len(data)

    print(f'请求失败次数过多,已达到最大重试次数{max_retries},跳过当前连接')
    return -1


# Region code -> province name.
provinces = {
    901: "山东", 902: "贵州", 903: "江西", 904: "重庆", 905: "内蒙古", 906: "湖北", 907: "辽宁", 908: "湖南",
    909: "福建", 910: "上海", 911: "北京", 912: "广西", 913: "广东", 914: "四川", 915: "云南", 916: "江苏",
    917: "浙江", 918: "青海", 919: "宁夏", 920: "河北", 921: "黑龙江", 922: "吉林", 923: "天津", 924: "陕西",
    925: "甘肃", 926: "新疆", 927: "河南", 928: "安徽", 929: "山西", 930: "海南", 931: "台湾",
    # 932: "西藏",
    933: "香港", 934: "澳门"
}
regions = provinces


def _pad_rows(data_combine, known_years, width):
    """Pad rows shorter than *width* with '' so keyword columns stay aligned."""
    for year, row in data_combine.items():
        missing = width - len(row)
        if missing <= 0:
            continue
        if year in known_years:
            # Row existed before this keyword: this keyword gave no value.
            row.extend([''] * missing)
        else:
            # Row was created by this keyword: earlier keywords are missing,
            # so the blanks go right after the [year, regionName] prefix.
            row[2:2] = [''] * missing


def crawl(regionCode, credential, expectedInterval, autoSave, regionName, data_combine):
    """Query every keyword for one region, filling *data_combine* in place.

    Args:
        regionCode: region code; the value 999 is skipped.
        credential: mapping accepted by ``generate_http_headers``.
        expectedInterval: base pause between queries, in milliseconds.
        autoSave: passed through to ``crawl_request`` (unused there).
        regionName: region label stored in each row.
        data_combine: dict ``year -> row`` updated in place. Each row is
            ``[year, regionName, v1, v2, ...]`` with one cell per keyword in
            list order; a keyword that yields nothing leaves ``''``.
    """
    # Data from 2011 through 2022.
    startDate = '2011-01-01'
    endDate = '2022-12-31'
    keywords = ['第三方支付', '在线支付', '移动支付', '网贷', '互联网理财', '互联网保险', '在线理财', '电子银行', '网银',
                '大数据', '云计算', '人工智能', '区块链', '生物识别']

    for column, keyword in enumerate(keywords):
        known_years = set(data_combine)
        # Compare as strings so both 999 and '999' are skipped.
        if str(regionCode) != '999':
            try:
                crawl_request(keyword, startDate, endDate, regionCode, credential, expectedInterval, autoSave,
                              regionName, data_combine)
            except Exception:
                print(format_exc())
        # Two leading cells (year, region) plus one cell per keyword so far.
        _pad_rows(data_combine, known_years, 2 + column + 1)
        # Pause between queries: expectedInterval ms plus 0.5-1.5 s of jitter.
        time.sleep(expectedInterval / 1000 + random.randint(1, 3) / 2)


def main():
    """Crawl every region and append its rows to ``combine_backup.csv``."""
    import csv

    # No header row is written. Column order: year, region, then one column
    # per keyword in the order of the keyword list inside crawl().
    for regionCode in regions:
        regionName = regions[regionCode]
        data_combine = {}
        crawl(regionCode, credential, 10, True, regionName, data_combine)
        data_list = list(data_combine.values())
        with open('combine_backup.csv', 'a', encoding='utf-8-sig', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(data_list)


if __name__ == '__main__':
    main()
Top comments (0)
Subscribe
For further actions, you may consider blocking this person and/or reporting abuse