Code for Speech Recognition using Transformers in Python Tutorial


View on Github

Open In Colab

AutomaticSpeechRecognition_PythonCodeTutorial.py

# %%
!pip install transformers==4.28.1 soundfile sentencepiece torchaudio pydub

# %%
from transformers import *
import torch
import soundfile as sf
# import librosa
import os
import torchaudio

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# %% [markdown]
# # Wav2Vec2.0 Models

# %%
# wav2vec2_model_name = "facebook/wav2vec2-base-960h" # 360MB
wav2vec2_model_name = "facebook/wav2vec2-large-960h-lv60-self" # pretrained 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english" # English-only, 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic" # Arabic-only, 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-spanish" # Spanish-only, 1.26GB
wav2vec2_processor = Wav2Vec2Processor.from_pretrained(wav2vec2_model_name)
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(wav2vec2_model_name).to(device)

# %%
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2bjrop1.0.wav"
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2bjrop1.1.wav"
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2btrop6.0.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav"
audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/30-4447-0004.wav"
# audio_url = "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0060_8k.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav"
# audio_url = "http://www0.cs.ucl.ac.uk/teaching/GZ05/samples/lathe.wav"

# %%
# load our wav file
speech, sr = torchaudio.load(audio_url)
speech = speech.squeeze()
# or using librosa
# speech, sr = librosa.load(audio_file, sr=16000)
sr, speech.shape

# %%
# resample from whatever the audio sampling rate is to 16000 Hz
resampler = torchaudio.transforms.Resample(sr, 16000)
speech = resampler(speech)
speech.shape

# %%
# tokenize our wav
input_values = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
input_values.shape

# %%
# perform inference
logits = wav2vec2_model(input_values)["logits"]
logits.shape

# %%
# use argmax to get the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)
predicted_ids.shape

# %%
# decode the IDs to text
transcription = wav2vec2_processor.decode(predicted_ids[0])
transcription.lower()

# %%
def load_audio(audio_path):
  """Load the audio file & convert to 16,000 sampling rate"""
  # load our wav file
  speech, sr = torchaudio.load(audio_path)
  resampler = torchaudio.transforms.Resample(sr, 16000)
  speech = resampler(speech)
  return speech.squeeze()

# %%
def get_transcription_wav2vec2(audio_path, model, processor):
  speech = load_audio(audio_path)
  input_features = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
  # perform inference
  logits = model(input_features)["logits"]
  # use argmax to get the predicted IDs
  predicted_ids = torch.argmax(logits, dim=-1)
  transcription = processor.batch_decode(predicted_ids)[0]
  return transcription.lower()

# %%
get_transcription_wav2vec2("http://www0.cs.ucl.ac.uk/teaching/GZ05/samples/lathe.wav",
                           wav2vec2_model,
                           wav2vec2_processor)
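# %% [markdown]
# As a quick usage example (a sketch, not part of the original tutorial), the
# `get_transcription_wav2vec2` helper can be looped over several files; the URLs
# below are sample files already listed earlier in this script.

# %%
sample_urls = [
    "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav",
    "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav",
]
for url in sample_urls:
  # reuse the already-loaded model and processor for each file
  print(url)
  print(get_transcription_wav2vec2(url, wav2vec2_model, wav2vec2_processor))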
whisper_model_name = "openai/whisper-medium.en" # English-only, ~ 3.06 GB# whisper_model_name = "openai/whisper-tiny" # multilingual, ~ 151 MB# whisper_model_name = "openai/whisper-base" # multilingual, ~ 290 MB# whisper_model_name = "openai/whisper-small" # multilingual, ~ 967 MBwhisper_model_name = "openai/whisper-medium" # multilingual, ~ 3.06 GB# whisper_model_name = "openai/whisper-large-v2" # multilingual, ~ 6.17 GBwhisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)# %%input_features = whisper_processor(load_audio(audio_url), sampling_rate=16000, return_tensors="pt").input_features.to(device)# %%forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="english", task="transcribe")# %%forced_decoder_ids# %%input_features.shape# %%predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=forced_decoder_ids)predicted_ids.shape# %%transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)transcription# %%transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=False)transcription# %%def get_transcription_whisper(audio_path, model, processor, language="english", skip_special_tokens=True):  # resample from whatever the audio sampling rate to 16000  speech = load_audio(audio_path)  input_features = processor(speech, return_tensors="pt", sampling_rate=16000).input_features  forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")  # print(forced_decoder_ids)  predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens)[0]  return transcription# %%arabic_transcription = get_transcription_whisper("https://datasets-server.huggingface.co/assets/arabic_speech_corpus/--/clean/train/0/audio/audio.wav",                          whisper_model,                          whisper_processor,                          language="arabic",                          skip_special_tokens=True)arabic_transcription# %%spanish_transcription = get_transcription_whisper("https://www.lightbulblanguages.co.uk/resources/sp-audio/cual-es-la-fecha-cumple.mp3",                          whisper_model,                          whisper_processor,                          language="spanish",                          skip_special_tokens=True)spanish_transcription# %%from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE # supported languagesTO_LANGUAGE_CODE # %% [markdown]# # Transcribe your Voice# %%!git clone -q --depth 1 https://github.com/snakers4/silero-models%cd silero-models# %%from IPython.display import Audio, display, clear_outputfrom colab_utils import record_audioimport ipywidgets as widgetsfrom scipy.io import wavfileimport numpy as nprecord_seconds =   20#@param {type:"number", min:1, max:10, step:1}sample_rate = 16000def _record_audio(b):  clear_output()  audio = record_audio(record_seconds)  display(Audio(audio, rate=sample_rate, autoplay=True))  wavfile.write('recorded.wav', sample_rate, (32767*audio).numpy().astype(np.int16))button = widgets.Button(description="Record Speech")button.on_click(_record_audio)display(button)# %%print("Whisper:", get_transcription_whisper("recorded.wav", whisper_model, whisper_processor))print("Wav2vec2:", get_transcription_wav2vec2("recorded.wav", wav2vec2_model, wav2vec2_processor))# %% [markdown]# # 
# %% [markdown]
# # Transcribing Long Audio Samples

# %%
def get_long_transcription_whisper(audio_path, pipe, return_timestamps=True,
                                   chunk_length_s=10, stride_length_s=2):
    """Get the transcription of a long audio file using the Whisper model"""
    return pipe(load_audio(audio_path).numpy(), return_timestamps=return_timestamps,
                chunk_length_s=chunk_length_s, stride_length_s=stride_length_s)

# %%
# initialize the pipeline
pipe = pipeline("automatic-speech-recognition",
                model=whisper_model_name, device=device)

# %%
# get the transcription of a sample long audio file
output = get_long_transcription_whisper(
    "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0060_8k.wav",
    pipe, chunk_length_s=10, stride_length_s=1)

# %%
output["text"]

# %%
for chunk in output["chunks"]:
  # print the timestamp and the text
  print(chunk["timestamp"], ":", chunk["text"])
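# %% [markdown]
# Each entry in `output["chunks"]` pairs a text segment with a `(start, end)`
# timestamp tuple, which maps naturally onto subtitle formats. The cell below
# is a small sketch (not part of the original tutorial) that writes the chunks
# to an SRT file.

# %%
def to_srt_time(seconds):
  """Format seconds as an SRT timestamp, e.g. 00:01:02,500."""
  ms = int(round(seconds * 1000))
  h, ms = divmod(ms, 3_600_000)
  m, ms = divmod(ms, 60_000)
  s, ms = divmod(ms, 1_000)
  return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

with open("transcription.srt", "w", encoding="utf-8") as f:
  for i, chunk in enumerate(output["chunks"], start=1):
    start, end = chunk["timestamp"]
    if end is None:  # the pipeline may leave the final chunk without an end time
      end = start
    f.write(f"{i}\n{to_srt_time(start)} --> {to_srt_time(end)}\n{chunk['text'].strip()}\n\n")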
