import os

# Restrict to the first two GPUs. This must be set before torch initializes
# CUDA, so it belongs at module import time, not inside transcribe_audio().
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

import torch
import librosa
from pydub import AudioSegment
from transformers import WhisperProcessor, WhisperForConditionalGeneration


def extract_audio_from_video(video_path: str) -> str:
    """Extract the audio track from a video file and save it as a WAV file."""
    audio = AudioSegment.from_file(video_path)
    audio_path = "/tmp/temp_audio_test.wav"
    audio.export(audio_path, format="wav")
    print("Audio extracted!")
    return audio_path


def transcribe_audio(audio_path: str) -> str:
    """Transcribe a WAV file with openai/whisper-tiny via transformers."""
    print("Loading model in transcribe_audio...")

    # Load processor and model from transformers.
    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = torch.nn.DataParallel(model)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("Model loaded successfully.")
    print(audio_path)

    # Load the audio at the 16 kHz sampling rate Whisper expects.
    audio_input, sr = librosa.load(audio_path, sr=16000)
    input_features = processor(
        audio_input, sampling_rate=sr, return_tensors="pt"
    ).input_features.to(device)

    # Generate the transcription; unwrap DataParallel to reach .generate().
    with torch.no_grad():
        if isinstance(model, torch.nn.DataParallel):
            generated_ids = model.module.generate(input_features)
        else:
            generated_ids = model.generate(input_features)

    # Decode the generated token IDs back to text.
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return transcription


if __name__ == "__main__":
    # Use the path returned by the extraction step instead of re-hardcoding it.
    audio_file = extract_audio_from_video("../video/1.mp4")
    for i in range(3):
        print(f"\nTranscription attempt {i + 1}:")
        transcription = transcribe_audio(audio_file)
        print("Transcription:")
        print(transcription)
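

# The original file imported load_model from the whisper package but never
# used it. For reference, a minimal sketch of the same transcription done
# directly with openai-whisper -- a sketch, not a tested drop-in replacement;
# it assumes the openai-whisper package and ffmpeg are installed, and the
# function name transcribe_audio_openai is hypothetical:
#
# import whisper
#
# def transcribe_audio_openai(audio_path: str) -> str:
#     # load_model downloads/caches the checkpoint; "tiny" matches whisper-tiny above
#     model = whisper.load_model("tiny")
#     # transcribe() handles resampling and long audio internally
#     result = model.transcribe(audio_path)
#     return result["text"]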
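
# Note: the Whisper processor pads/truncates its input to a 30 s window, so
# transcribe_audio() above only covers roughly the first 30 seconds of audio.
# A hedged sketch for longer files using the transformers ASR pipeline with
# chunking (assumes ffmpeg is available so the pipeline can decode the file):
#
# from transformers import pipeline
#
# asr = pipeline(
#     "automatic-speech-recognition",
#     model="openai/whisper-tiny",
#     chunk_length_s=30,  # split long audio into 30 s windows
#     device=0 if torch.cuda.is_available() else -1,
# )
# long_transcription = asr("/tmp/temp_audio_test.wav")["text"]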