Code for Speech Recognition using Transformers in Python Tutorial


View on Github

Open In Colab

AutomaticSpeechRecognition_PythonCodeTutorial.py

# %%
!pip install transformers==4.28.1 soundfile sentencepiece torchaudio pydub

# %%
from transformers import *
import torch
import soundfile as sf
# import librosa
import os
import torchaudio

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# %% [markdown]
# # Wav2Vec2.0 Models

# %%
# wav2vec2_model_name = "facebook/wav2vec2-base-960h" # 360MB
wav2vec2_model_name = "facebook/wav2vec2-large-960h-lv60-self" # pretrained 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-english" # English-only, 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic" # Arabic-only, 1.26GB
# wav2vec2_model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-spanish" # Spanish-only, 1.26GB
wav2vec2_processor = Wav2Vec2Processor.from_pretrained(wav2vec2_model_name)
wav2vec2_model = Wav2Vec2ForCTC.from_pretrained(wav2vec2_model_name).to(device)

# %%
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2bjrop1.0.wav"
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2bjrop1.1.wav"
# audio_url = "http://www.fit.vutbr.cz/~motlicek/sympatex/f2btrop6.0.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav"
audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/30-4447-0004.wav"
# audio_url = "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0060_8k.wav"
# audio_url = "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav"
# audio_url = "http://www0.cs.ucl.ac.uk/teaching/GZ05/samples/lathe.wav"

# %%
# load our wav file
speech, sr = torchaudio.load(audio_url)
speech = speech.squeeze()
# or using librosa
# speech, sr = librosa.load(audio_file, sr=16000)
sr, speech.shape

# %%
# resample from whatever the audio sampling rate is to 16000 Hz
resampler = torchaudio.transforms.Resample(sr, 16000)
speech = resampler(speech)
speech.shape

# %%
# tokenize our wav
input_values = wav2vec2_processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
input_values.shape

# %%
# perform inference
logits = wav2vec2_model(input_values)["logits"]
logits.shape

# %%
# use argmax to get the predicted IDs
predicted_ids = torch.argmax(logits, dim=-1)
predicted_ids.shape

# %%
# decode the IDs to text
transcription = wav2vec2_processor.decode(predicted_ids[0])
transcription.lower()

# %%
def load_audio(audio_path):
  """Load the audio file & convert to 16,000 sampling rate"""
  # load our wav file
  speech, sr = torchaudio.load(audio_path)
  resampler = torchaudio.transforms.Resample(sr, 16000)
  speech = resampler(speech)
  return speech.squeeze()

# %%
def get_transcription_wav2vec2(audio_path, model, processor):
  speech = load_audio(audio_path)
  input_features = processor(speech, return_tensors="pt", sampling_rate=16000)["input_values"].to(device)
  # perform inference
  logits = model(input_features)["logits"]
  # use argmax to get the predicted IDs
  predicted_ids = torch.argmax(logits, dim=-1)
  transcription = processor.batch_decode(predicted_ids)[0]
  return transcription.lower()

# %%
get_transcription_wav2vec2("http://www0.cs.ucl.ac.uk/teaching/GZ05/samples/lathe.wav",
                           wav2vec2_model,
                           wav2vec2_processor)
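# %% [markdown]
# As a quick usage example (a sketch, not part of the original tutorial), the
# `get_transcription_wav2vec2` helper can be looped over several files; the URLs
# below are sample files already listed earlier in this script.

# %%
sample_urls = [
    "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/16-122828-0002.wav",
    "https://github.com/x4nth055/pythoncode-tutorials/raw/master/machine-learning/speech-recognition/7601-291468-0006.wav",
]
for url in sample_urls:
  # reuse the already-loaded model and processor for each file
  print(url)
  print(get_transcription_wav2vec2(url, wav2vec2_model, wav2vec2_processor))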
whisper_model_name = "openai/whisper-medium.en" # English-only, ~ 3.06 GB# whisper_model_name = "openai/whisper-tiny" # multilingual, ~ 151 MB# whisper_model_name = "openai/whisper-base" # multilingual, ~ 290 MB# whisper_model_name = "openai/whisper-small" # multilingual, ~ 967 MBwhisper_model_name = "openai/whisper-medium" # multilingual, ~ 3.06 GB# whisper_model_name = "openai/whisper-large-v2" # multilingual, ~ 6.17 GBwhisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)# %%input_features = whisper_processor(load_audio(audio_url), sampling_rate=16000, return_tensors="pt").input_features.to(device)# %%forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language="english", task="transcribe")# %%forced_decoder_ids# %%input_features.shape# %%predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=forced_decoder_ids)predicted_ids.shape# %%transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)transcription# %%transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=False)transcription# %%def get_transcription_whisper(audio_path, model, processor, language="english", skip_special_tokens=True):  # resample from whatever the audio sampling rate to 16000  speech = load_audio(audio_path)  input_features = processor(speech, return_tensors="pt", sampling_rate=16000).input_features  forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")  # print(forced_decoder_ids)  predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens)[0]  return transcription# %%arabic_transcription = get_transcription_whisper("https://datasets-server.huggingface.co/assets/arabic_speech_corpus/--/clean/train/0/audio/audio.wav",                          whisper_model,                          whisper_processor,                          language="arabic",                          skip_special_tokens=True)arabic_transcription# %%spanish_transcription = get_transcription_whisper("https://www.lightbulblanguages.co.uk/resources/sp-audio/cual-es-la-fecha-cumple.mp3",                          whisper_model,                          whisper_processor,                          language="spanish",                          skip_special_tokens=True)spanish_transcription# %%from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE # supported languagesTO_LANGUAGE_CODE # %% [markdown]# # Transcribe your Voice# %%!git clone -q --depth 1 https://github.com/snakers4/silero-models%cd silero-models# %%from IPython.display import Audio, display, clear_outputfrom colab_utils import record_audioimport ipywidgets as widgetsfrom scipy.io import wavfileimport numpy as nprecord_seconds =   20#@param {type:"number", min:1, max:10, step:1}sample_rate = 16000def _record_audio(b):  clear_output()  audio = record_audio(record_seconds)  display(Audio(audio, rate=sample_rate, autoplay=True))  wavfile.write('recorded.wav', sample_rate, (32767*audio).numpy().astype(np.int16))button = widgets.Button(description="Record Speech")button.on_click(_record_audio)display(button)# %%print("Whisper:", get_transcription_whisper("recorded.wav", whisper_model, whisper_processor))print("Wav2vec2:", get_transcription_wav2vec2("recorded.wav", wav2vec2_model, wav2vec2_processor))# %% [markdown]# # 
# %% [markdown]
# # Transcribing Long Audio Samples

# %%
def get_long_transcription_whisper(audio_path, pipe, return_timestamps=True,
                                   chunk_length_s=10, stride_length_s=2):
    """Get the transcription of a long audio file using the Whisper model"""
    return pipe(load_audio(audio_path).numpy(), return_timestamps=return_timestamps,
                chunk_length_s=chunk_length_s, stride_length_s=stride_length_s)

# %%
# initialize the pipeline
pipe = pipeline("automatic-speech-recognition",
                model=whisper_model_name, device=device)

# %%
# get the transcription of a sample long audio file
output = get_long_transcription_whisper(
    "https://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0060_8k.wav",
    pipe, chunk_length_s=10, stride_length_s=1)

# %%
output["text"]

# %%
for chunk in output["chunks"]:
  # print the timestamp and the text
  print(chunk["timestamp"], ":", chunk["text"])
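# %% [markdown]
# Each entry in `output["chunks"]` pairs a text segment with a `(start, end)`
# timestamp tuple, which maps naturally onto subtitle formats. The cell below
# is a small sketch (not part of the original tutorial) that writes the chunks
# to an SRT file.

# %%
def to_srt_time(seconds):
  """Format seconds as an SRT timestamp, e.g. 00:01:02,500."""
  ms = int(round(seconds * 1000))
  h, ms = divmod(ms, 3_600_000)
  m, ms = divmod(ms, 60_000)
  s, ms = divmod(ms, 1_000)
  return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

with open("transcription.srt", "w", encoding="utf-8") as f:
  for i, chunk in enumerate(output["chunks"], start=1):
    start, end = chunk["timestamp"]
    if end is None:  # the pipeline may leave the final chunk without an end time
      end = start
    f.write(f"{i}\n{to_srt_time(start)} --> {to_srt_time(end)}\n{chunk['text'].strip()}\n\n")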
