import os
import re
import textwrap
from heapq import nlargest
from urllib.parse import urlparse, parse_qs

import nltk
import pytube
import youtube_transcript_api
from colorama import Fore, Back, Style, init
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize
from openai import OpenAI
from youtube_transcript_api import YouTubeTranscriptApi

# Initialize colorama for cross-platform colored terminal output
init(autoreset=True)

# Download necessary NLTK data (quiet=True suppresses the progress output)
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Initialize OpenAI client pointed at OpenRouter.
# Prefer the OPENROUTER_API_KEY environment variable over a key hard-coded
# in source; the "<api_key>" placeholder remains the fallback so existing
# behavior is unchanged when the variable is unset.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ.get("OPENROUTER_API_KEY", "<api_key>"),
)
| 29 | + |
def extract_video_id(youtube_url):
    """Extract the video ID from a YouTube URL.

    Supports youtu.be short links and youtube.com /watch?v=, /embed/ and
    /v/ URL forms.

    Raises:
        ValueError: if the URL does not match any recognized YouTube format
            or a /watch URL is missing its ``v`` query parameter.
    """
    parsed_url = urlparse(youtube_url)

    # Short-link form: https://youtu.be/<id>
    if parsed_url.netloc == 'youtu.be':
        return parsed_url.path[1:]

    if parsed_url.netloc in ('www.youtube.com', 'youtube.com'):
        if parsed_url.path == '/watch':
            # Bug fix: a /watch URL without ?v=... previously raised a bare
            # KeyError; fall through to the documented ValueError instead.
            query = parse_qs(parsed_url.query)
            if 'v' in query:
                return query['v'][0]
        elif parsed_url.path.startswith('/embed/'):
            return parsed_url.path.split('/')[2]
        elif parsed_url.path.startswith('/v/'):
            return parsed_url.path.split('/')[2]

    # If no match found
    raise ValueError(f"Could not extract video ID from URL:{youtube_url}")
| 47 | + |
def get_transcript(video_id):
    """Get the transcript of a YouTube video.

    Returns the transcript entries joined into one space-separated string,
    or an "Error retrieving transcript:..." message string on any failure
    (callers detect this via the "Error" prefix).
    """
    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(entry['text'] for entry in entries)
    except Exception as exc:
        # Best-effort API: report the failure as text rather than raising.
        return f"Error retrieving transcript:{str(exc)}."
| 55 | + |
def summarize_text_nltk(text, num_sentences=5):
    """Summarize text using frequency-based extractive summarization with NLTK."""
    # Pass error/unavailable markers through untouched.
    if not text or text.startswith("Error") or text.startswith("Transcript not available"):
        return text

    sentences = sent_tokenize(text)

    # Nothing to condense if the text is already short enough.
    if len(sentences) <= num_sentences:
        return text

    # Build a frequency table over content words only (alphanumeric tokens
    # that are not English stopwords).
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    content_words = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
    freq = FreqDist(content_words)

    # Score each sentence by summing the frequencies of its content words.
    sentence_scores = {}
    for idx, sentence in enumerate(sentences):
        for tok in word_tokenize(sentence.lower()):
            if tok in freq:
                sentence_scores[idx] = sentence_scores.get(idx, 0) + freq[tok]

    # Take the top-scoring sentences, then restore original document order.
    top_indices = sorted(nlargest(num_sentences, sentence_scores, key=sentence_scores.get))

    return ' '.join(sentences[idx] for idx in top_indices)
| 93 | + |
def summarize_text_ai(text, video_title, num_sentences=5):
    """Summarize text using the Mistral AI model via OpenRouter."""
    # Pass error/unavailable markers through untouched.
    if not text or text.startswith("Error") or text.startswith("Transcript not available"):
        return text

    # Models have limited context windows, so cap the transcript length.
    max_chars = 15000
    if len(text) > max_chars:
        truncated_text = text[:max_chars]
    else:
        truncated_text = text

    prompt = f"""Please provide a concise summary of the following YouTube video transcript.
Title:{video_title}

Transcript:
{truncated_text}

Create a clear, informative summary that captures the main points and key insights from the video.
Your summary should be approximately{num_sentences} sentences long.
"""

    request_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt,
            }
        ],
    }

    try:
        completion = client.chat.completions.create(
            model="mistralai/mistral-small-3.1-24b-instruct:free",
            messages=[request_message],
        )
        return completion.choices[0].message.content
    except Exception as exc:
        # Surface API failures as text; callers detect the "Error" prefix.
        return f"Error generating AI summary:{str(exc)}"
| 131 | + |
def summarize_youtube_video(youtube_url, num_sentences=5):
    """Main function to summarize a YouTube video's transcription.

    Returns a dict with the title, ID, both summaries and word counts, or
    a {"error": ...} dict if anything fails.
    """
    try:
        video_id = extract_video_id(youtube_url)
        transcript = get_transcript(video_id)

        # Look up the video title for context; fall back if pytube fails.
        try:
            video_title = pytube.YouTube(youtube_url).title
        except Exception:
            video_title = "Unknown Title"

        # Generate both summaries
        print(Fore.YELLOW + f"Generating AI summary with{num_sentences} sentences...")
        ai_summary = summarize_text_ai(transcript, video_title, num_sentences)

        print(Fore.YELLOW + f"Generating NLTK summary with{num_sentences} sentences...")
        nltk_summary = summarize_text_nltk(transcript, num_sentences)

        # An AI summary starting with "Error" is a failure marker, so its
        # word count is reported as zero.
        ai_word_count = 0 if ai_summary.startswith("Error") else len(ai_summary.split())

        return {
            "video_title": video_title,
            "video_id": video_id,
            "ai_summary": ai_summary,
            "nltk_summary": nltk_summary,
            "full_transcript_length": len(transcript.split()),
            "nltk_summary_length": len(nltk_summary.split()),
            "ai_summary_length": ai_word_count,
        }
    except Exception as exc:
        return {"error": str(exc)}
| 165 | + |
def format_time(seconds):
    """Convert seconds to a readable time format like ``1h2m5s``.

    Accepts ints or floats; fractional seconds are truncated so float
    input no longer produces output like ``1.0h2.0m5.0s`` (bug fix).
    """
    seconds = int(seconds)
    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    if hours > 0:
        return f"{hours}h{minutes}m{seconds}s"
    elif minutes > 0:
        return f"{minutes}m{seconds}s"
    else:
        return f"{seconds}s"
| 177 | + |
def format_number(number):
    """Format large numbers with commas for readability."""
    return f"{number:,}"
| 181 | + |
def print_boxed_text(text, width=80, title=None, color=Fore.WHITE):
    """Print text in a nice box with optional title.

    Bug fixes: the titled top border was 2 characters narrower than the
    untitled one, and every content row was 1 character wider than the
    borders. All rows now render exactly ``width`` characters wide.
    """
    wrapper = textwrap.TextWrapper(width=width - 4)  # -4 for the box margins
    wrapped_text = wrapper.fill(text)
    lines = wrapped_text.split('\n')

    # Print top border with optional title, centered between the dashes.
    if title:
        # width-2 dashes+title between the corner characters, matching the
        # untitled border's total width.
        title_space = width - 2 - len(title)
        left_padding = title_space // 2
        right_padding = title_space - left_padding
        print(color + '┌' + '─' * left_padding + title + '─' * right_padding + '┐')
    else:
        print(color + '┌' + '─' * (width - 2) + '┐')

    # Print content: '│ ' + line + padding + '│' totals exactly `width`.
    for line in lines:
        padding = width - 3 - len(line)
        print(color + '│ ' + line + ' ' * padding + '│')

    # Print bottom border
    print(color + '└' + '─' * (width - 2) + '┘')
| 204 | + |
def print_summary_result(result, width=80):
    """Print the summary result in a nicely formatted way.

    Expects the dict produced by summarize_youtube_video; when it contains
    an "error" key only an error box is printed.

    Bug fix: the compression percentages previously divided by
    ``full_transcript_length`` unguarded and raised ZeroDivisionError on an
    empty transcript; both ratios now fall back to "N/A".
    """
    if "error" in result:
        print_boxed_text(f"Error:{result['error']}", width=width, title="ERROR", color=Fore.RED)
        return

    # Terminal width
    terminal_width = width
    transcript_words = result['full_transcript_length']

    # Print header with video information
    print("\n" + Fore.CYAN + "=" * terminal_width)
    print(Fore.CYAN + Style.BRIGHT + result['video_title'].center(terminal_width))
    print(Fore.CYAN + "=" * terminal_width + "\n")

    # Video metadata section
    print(Fore.YELLOW + Style.BRIGHT + "VIDEO INFORMATION".center(terminal_width))
    print(Fore.YELLOW + "─" * terminal_width)

    # Two-column layout for metadata
    col_width = terminal_width // 2 - 2

    print(f"{Fore.GREEN}Video ID:{Fore.WHITE}{result['video_id']:<{col_width}}"
          f"{Fore.GREEN}URL:{Fore.WHITE}https://youtu.be/{result['video_id']}")

    print(Fore.YELLOW + "─" * terminal_width + "\n")

    # AI Summary section. Guard both the zero-length summary (API failure)
    # and the zero-length transcript cases.
    ai_compression = "N/A"
    if result['ai_summary_length'] > 0 and transcript_words > 0:
        ai_compression = round((1 - result['ai_summary_length'] / transcript_words) * 100)

    ai_summary_title = f" AI SUMMARY ({result['ai_summary_length']} words, condensed{ai_compression}% from{transcript_words} words) "

    print(Fore.GREEN + Style.BRIGHT + ai_summary_title.center(terminal_width))
    print(Fore.GREEN + "─" * terminal_width)

    # Wrapper reused for both summaries below.
    wrapper = textwrap.TextWrapper(width=terminal_width - 4,
                                   initial_indent=' ',
                                   subsequent_indent=' ')

    # Split AI summary into paragraphs and print each
    for paragraph in result['ai_summary'].split('\n'):
        if paragraph.strip():  # Skip empty paragraphs
            print(wrapper.fill(paragraph))
            print()  # Empty line between paragraphs

    print(Fore.GREEN + "─" * terminal_width + "\n")

    # NLTK Summary section (same zero-transcript guard as above)
    nltk_compression = "N/A"
    if transcript_words > 0:
        nltk_compression = round((1 - result['nltk_summary_length'] / transcript_words) * 100)
    nltk_summary_title = f" NLTK SUMMARY ({result['nltk_summary_length']} words, condensed{nltk_compression}% from{transcript_words} words) "

    print(Fore.MAGENTA + Style.BRIGHT + nltk_summary_title.center(terminal_width))
    print(Fore.MAGENTA + "─" * terminal_width)

    # Re-group the extracted sentences into ~150-character paragraphs since
    # an extractive summary has no natural paragraph breaks.
    sentence_chunks = result['nltk_summary'].split('. ')
    formatted_paragraphs = []

    current_paragraph = ""
    for sentence in sentence_chunks:
        if not sentence.endswith('.'):
            sentence += '.'

        if len(current_paragraph) + len(sentence) + 1 <= 150:  # Arbitrary length for paragraph
            current_paragraph += " " + sentence if current_paragraph else sentence
        else:
            if current_paragraph:
                formatted_paragraphs.append(current_paragraph)
            current_paragraph = sentence

    if current_paragraph:
        formatted_paragraphs.append(current_paragraph)

    # Print each paragraph
    for paragraph in formatted_paragraphs:
        print(wrapper.fill(paragraph))
        print()  # Empty line between paragraphs

    print(Fore.MAGENTA + "─" * terminal_width + "\n")
| 288 | + |
| 289 | + |
if __name__ == "__main__":
    # Determine a sensible output width from the terminal, clamped to the
    # 80-120 range. Bug fix: the bare `except:` is narrowed to OSError,
    # which is what get_terminal_size raises when stdout is not a tty.
    try:
        terminal_width = os.get_terminal_size().columns
        terminal_width = max(80, min(terminal_width, 120))
    except OSError:
        terminal_width = 80  # Default if can't determine

    # Print welcome banner
    print(Fore.CYAN + Style.BRIGHT + "\n" + "=" * terminal_width)
    print(Fore.CYAN + Style.BRIGHT + "YOUTUBE VIDEO SUMMARIZER".center(terminal_width))
    print(Fore.CYAN + Style.BRIGHT + "=" * terminal_width + "\n")

    youtube_url = input(Fore.GREEN + "Enter YouTube video URL: " + Fore.WHITE)

    num_sentences_input = input(Fore.GREEN + "Enter number of sentences for summaries (default 5): " + Fore.WHITE)
    # Bug fix: non-numeric input used to crash with ValueError; fall back
    # to the advertised default of 5 instead.
    try:
        num_sentences = int(num_sentences_input) if num_sentences_input.strip() else 5
    except ValueError:
        num_sentences = 5

    print(Fore.YELLOW + "\nFetching and analyzing video transcript... Please wait...\n")

    result = summarize_youtube_video(youtube_url, num_sentences)
    print_summary_result(result, width=terminal_width)