import requests
import json
import time

# Code is partially grabbed from this repository:
# https://github.com/egbertbouman/youtube-comment-downloader

def search_dict(partial, key):
    """
    A handy function that searches for a specific `key` in the `partial` dictionary/list
    """
    if isinstance(partial, dict):
        for k, v in partial.items():
            if k == key:
                # found the key, yield the value
                yield v
            else:
                # the value of the dict may be another dict or a list, so we search there again
                for o in search_dict(v, key):
                    yield o
    elif isinstance(partial, list):
        # if the passed data is a list,
        # iterate over it & search for the key in each of its items
        for i in partial:
            for o in search_dict(i, key):
                yield o

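# A quick illustration of search_dict on made-up data (not a real YouTube
# payload), just to show the recursive traversal:
#
#   data = {"a": {"b": [{"target": 1}, {"c": {"target": 2}}]}}
#   list(search_dict(data, "target"))  # -> [1, 2]
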

def find_value(html, key, num_sep_chars=2, separator='"'):
    # define the start position by the position of the key +
    # length of key + separator length (usually : and ")
    start_pos = html.find(key) + len(key) + num_sep_chars
    # the end position is the position of the separator (such as ")
    # starting from the start_pos
    end_pos = html.find(separator, start_pos)
    # return the content in this range
    return html[start_pos:end_pos]

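# A quick illustration with a made-up snippet (not actual YouTube markup):
#
#   page = 'foo "XSRF_TOKEN":"abc123", bar'
#   find_value(page, "XSRF_TOKEN", num_sep_chars=3)  # -> 'abc123'
#
# num_sep_chars=3 skips the '":"' between the key and its value.
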

def get_comments(url):
    session = requests.Session()
    # make the request
    res = session.get(url)
    # extract the XSRF token
    xsrf_token = find_value(res.text, "XSRF_TOKEN", num_sep_chars=3)
    # parse the YouTube initial data in the <script> tag
    data_str = find_value(res.text, 'window["ytInitialData"] = ', num_sep_chars=0, separator="\n").rstrip(";")
    # convert to a Python dictionary instead of a plain text string
    data = json.loads(data_str)
    # search for the ctoken & continuation parameter fields
    for r in search_dict(data, "itemSectionRenderer"):
        # pass a default of None so a section without pagination data
        # doesn't raise StopIteration
        pagination_data = next(search_dict(r, "nextContinuationData"), None)
        if pagination_data:
            # if we got something, break out of the loop,
            # we have the data we need
            break

    continuation_tokens = [(pagination_data['continuation'], pagination_data['clickTrackingParams'])]

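    # Note: this list doubles as a simple work queue; pop() takes tokens from
    # the end, while newly discovered tokens are prepended inside the loop
    # below, so leftover tokens from earlier pages are consumed before the
    # newly added ones.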
    while continuation_tokens:
        # keep looping until the continuation tokens list is empty (no more comments)
        continuation, itct = continuation_tokens.pop()

        # construct the request params (the ones in the URL)
        params = {
            "action_get_comments": 1,
            "pbj": 1,
            "ctoken": continuation,
            "continuation": continuation,
            "itct": itct,
        }

        # construct the POST body data, which consists of the XSRF token
        data = {
            "session_token": xsrf_token,
        }

        # construct request headers
        headers = {
            "x-youtube-client-name": "1",
            "x-youtube-client-version": "2.20200731.02.01",
        }

        # make the POST request to get the comments data
        response = session.post("https://www.youtube.com/comment_service_ajax",
                                params=params, data=data, headers=headers)
        # convert to a Python dictionary
        comments_data = json.loads(response.text)

        for comment in search_dict(comments_data, "commentRenderer"):
            # iterate over loaded comments and yield useful info
            yield {
                "commentId": comment["commentId"],
                "text": ''.join([c['text'] for c in comment['contentText']['runs']]),
                "time": comment['publishedTimeText']['runs'][0]['text'],
                "isLiked": comment["isLiked"],
                "likeCount": comment["likeCount"],
                # "replyCount": comment["replyCount"],
                'author': comment.get('authorText', {}).get('simpleText', ''),
                'channel': comment['authorEndpoint']['browseEndpoint']['browseId'],
                'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                'photo': comment['authorThumbnail']['thumbnails'][-1]['url'],
                "authorIsChannelOwner": comment["authorIsChannelOwner"],
            }

        # load continuation tokens for the next comments (ctoken & itct)
        continuation_tokens = [(next_cdata['continuation'], next_cdata['clickTrackingParams'])
                               for next_cdata in search_dict(comments_data, 'nextContinuationData')] + continuation_tokens

        # sleep a bit to avoid heavy loads on popular videos
        time.sleep(0.1)
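
# For reference, each dict yielded by get_comments is shaped roughly like the
# following (all values here are made-up placeholders, not real data):
#
#   {
#       "commentId": "Ugz...AaABAg",
#       "text": "Great video!",
#       "time": "1 year ago",
#       "isLiked": False,
#       "likeCount": 42,
#       "author": "Some User",
#       "channel": "UC...",
#       "votes": "42",
#       "photo": "https://yt3.ggpht.com/...",
#       "authorIsChannelOwner": False,
#   }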


if __name__ == "__main__":
    # from pprint import pprint
    # url = "https://www.youtube.com/watch?v=jNQXAC9IVRw"
    # for count, comment in enumerate(get_comments(url)):
    #     if count == 3:
    #         break
    #     pprint(comment)
    #     print("="*50)
    import argparse
    import os

    parser = argparse.ArgumentParser(description="Simple YouTube Comment extractor")
    parser.add_argument("url", help="The YouTube video full URL")
    parser.add_argument("-l", "--limit", type=int, help="Number of maximum comments to extract, helpful for longer videos")
    parser.add_argument("-o", "--output", help="Output JSON file, e.g. data.json")

    # parse passed arguments
    args = parser.parse_args()
    limit = args.limit
    output = args.output
    url = args.url

    from pprint import pprint
    # count comments ourselves so the total is correct even when the
    # generator runs out before the limit (or yields nothing at all)
    count = 0
    for comment in get_comments(url):
        if limit and count >= limit:
            # break out of the loop when we reach the specified limit
            break
        if output:
            # write the comment as JSON to a file
            with open(output, "a") as f:
                # begin writing, adding an opening bracket
                if count == 0:
                    f.write("[")
                f.write(json.dumps(comment, ensure_ascii=False) + ",")
        else:
            pprint(comment)
            print("="*50)
        count += 1
    print("total comments extracted:", count)
    if output and count:
        # remove the last comma ','
        with open(output, "rb+") as f:
            f.seek(-1, os.SEEK_END)
            f.truncate()
        # add "]" to close the list at the end of the file
        with open(output, "a") as f:
            print("]", file=f)
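
# Example invocation (the script file name here is hypothetical):
#
#   $ python youtube_comment_extractor.py "https://www.youtube.com/watch?v=jNQXAC9IVRw" -l 50 -o comments.json
#
# Omitting -o pretty-prints the comments to stdout instead of writing JSON.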