|
1 | 1 | fromrequests_htmlimportHTMLSession
|
2 | 2 | frombs4importBeautifulSoupasbs
|
| 3 | +importre |
| 4 | +importjson |
3 | 5 |
|
# Create the HTMLSession instance once, right after the imports, and bind it
# to the name `session`.
session = HTMLSession()
|
@@ -27,22 +29,30 @@ def get_video_info(url):
|
27 | 29 | result["duration"]=soup.find("span", {"class":"ytp-time-duration"}).text
|
28 | 30 | # get the video tags
|
29 | 31 | result["tags"]=', '.join([meta.attrs.get("content")formetainsoup.find_all("meta", {"property":"og:video:tag"}) ])
|
30 |
| -# number of likes |
31 |
| -text_yt_formatted_strings=soup.find_all("yt-formatted-string", {"id":"text","class":"ytd-toggle-button-renderer"}) |
32 |
| -result["likes"]=''.join([cforcintext_yt_formatted_strings[0].attrs.get("aria-label")ifc.isdigit() ]) |
33 |
| -result["likes"]=0ifresult['likes']==''elseint(result['likes']) |
34 |
| -# number of dislikes |
35 |
| -result["dislikes"]=''.join([cforcintext_yt_formatted_strings[1].attrs.get("aria-label")ifc.isdigit() ]) |
36 |
| -result['dislikes']=0ifresult['dislikes']==''elseint(result['dislikes']) |
37 | 32 |
|
| 33 | +# Additional video and channel information (with help from: https://stackoverflow.com/a/68262735) |
| 34 | +data=re.search(r"var ytInitialData = ({.*?});",soup.prettify()).group(1) |
| 35 | +data_json=json.loads(data) |
| 36 | +videoPrimaryInfoRenderer=data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][0]['videoPrimaryInfoRenderer'] |
| 37 | +videoSecondaryInfoRenderer=data_json['contents']['twoColumnWatchNextResults']['results']['results']['contents'][1]['videoSecondaryInfoRenderer'] |
| 38 | +# number of likes |
| 39 | +likes_label=videoPrimaryInfoRenderer['videoActions']['menuRenderer']['topLevelButtons'][0]['toggleButtonRenderer']['defaultText']['accessibility']['accessibilityData']['label']# "No likes" or "###,### likes" |
| 40 | +likes_str=likes_label.split(' ')[0].replace(',','') |
| 41 | +result["likes"]='0'iflikes_str=='No'elselikes_str |
| 42 | +# number of dislikes - YouTube does not publish this anymore...? |
| 43 | +# result["dislikes"] = ''.join([ c for c in text_yt_formatted_strings[1].attrs.get("aria-label") if c.isdigit() ]) |
| 44 | +# result["dislikes"] = '0' if result['dislikes'] == '' else result['dislikes'] |
| 45 | +result['dislikes']='UNKNOWN' |
| 46 | + |
38 | 47 | # channel details
|
39 |
| -channel_tag=soup.find("yt-formatted-string",{"class":"ytd-channel-name"}).find("a") |
| 48 | +channel_tag=soup.find("meta",itemprop="channelId")['content'] |
40 | 49 | # channel name
|
41 |
| -channel_name=channel_tag.text |
| 50 | +channel_name=soup.find("span",itemprop="author").next.next['content'] |
42 | 51 | # channel URL
|
43 |
| -channel_url=f"https://www.youtube.com{channel_tag['href']}" |
| 52 | +# channel_url = soup.find("span", itemprop="author").next['href'] |
| 53 | +channel_url=f"https://www.youtube.com{channel_tag}" |
44 | 54 | # number of subscribers as str
|
45 |
| -channel_subscribers=soup.find("yt-formatted-string", {"id":"owner-sub-count"}).text.strip() |
| 55 | +channel_subscribers=videoSecondaryInfoRenderer['owner']['videoOwnerRenderer']['subscriberCountText']['accessibility']['accessibilityData']['label'] |
46 | 56 | result['channel']= {'name':channel_name,'url':channel_url,'subscribers':channel_subscribers}
|
47 | 57 | returnresult
|
48 | 58 |
|
|