master

分支 (1)

管理

管理

master

script_recognizer
/
distil_whisper.py

import torch
import time
from pathlib import Path
from transformers import AutoModelForSpeechSeq2Seq, WhisperForConditionalGeneration, WhisperProcessor, AutoProcessor, pipeline
from loguru import logger
from pyAudioAnalysis import audioBasicIO as aIO
import librosa

class WhisperPipe:
    """Transcribe audio files with a Whisper conditional-generation model.

    The model checkpoint is taken from ``args['model_id']``. The processor
    (feature extractor + tokenizer) defaults to ``openai/whisper-small`` and
    can be overridden with the optional ``args['processor_id']`` key.
    """

    # Sample rate passed to the processor; audio is resampled to it on load.
    _TARGET_SAMPLE_RATE = 16_000
    # Processor checkpoint used when the caller does not supply one.
    _DEFAULT_PROCESSOR_ID = 'openai/whisper-small'
    # Text used to prime the decoder. Empty disables prompting in practice;
    # a domain prompt such as "Codriver calls in Rallying:" was tried before.
    _PROMPT_TEXT = ''

    def __init__(self, args: dict):
        """Pick the compute device and load the model and processor.

        Args:
            args: Configuration mapping. Must contain ``'model_id'``; may
                contain ``'processor_id'``.

        Raises:
            KeyError: If ``'model_id'`` is missing from *args*.
        """
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self._load_model(
            args['model_id'],
            args.get('processor_id', self._DEFAULT_PROCESSOR_ID),
        )

    def recognize(self, audio_path: str) -> str:
        """Return the English transcription of the audio file at *audio_path*.

        Args:
            audio_path: Path of an audio file readable by ``librosa.load``.

        Returns:
            The decoded text of the first (only) sequence, with the prompt
            text removed when it appears in the decoded output.
        """
        # Let librosa resample while loading; the original author noted the
        # recorded wav files are 48 kHz.
        samples, _ = librosa.load(audio_path, sr=self._TARGET_SAMPLE_RATE)
        inputs = self.processor(
            samples,
            return_tensors='pt',
            sampling_rate=self._TARGET_SAMPLE_RATE,
        )

        prompt_text = self._PROMPT_TEXT
        prompt_ids = self.processor.get_prompt_ids(prompt_text, return_tensors='pt')

        # Inputs must live on the same device as the model.
        generated_ids = self.model.generate(
            input_features=inputs.input_features.to(self.device),
            prompt_ids=prompt_ids.to(self.device),
            language="english",
        )

        transcription = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        return self._strip_prompt(transcription[0], prompt_text)

    @staticmethod
    def _strip_prompt(text: str, prompt_text: str) -> str:
        """Drop everything up to and including the first *prompt_text* in *text*.

        If *prompt_text* is empty or does not occur in *text*, *text* is
        returned unchanged. The previous ``str.index``-based slicing raised
        ``ValueError`` in the "does not occur" case.
        NOTE(review): whether the decoded output still contains the prompt
        appears to depend on the transformers version -- confirm.
        """
        if not prompt_text:
            return text
        _, matched, remainder = text.partition(prompt_text)
        return remainder if matched else text

    def _load_model(self, model_id: str, processor_id: str = _DEFAULT_PROCESSOR_ID):
        """Load the Whisper model and processor and store them on the instance.

        Args:
            model_id: Hub id or local path of the model checkpoint.
            processor_id: Hub id or local path of the processor to pair with
                the model.
        """
        # The original code selected float32 on both CUDA and CPU, so the
        # dtype is simply fixed here.
        # NOTE(review): if half precision on GPU was intended, the input
        # features in `recognize` would need a matching cast -- confirm.
        torch_dtype = torch.float32
        logger.info('using device: {}', self.device)

        logger.info('going to load model')
        model = WhisperForConditionalGeneration.from_pretrained(
            model_id,
            use_safetensors=True,
            torch_dtype=torch_dtype,
        )
        processor = WhisperProcessor.from_pretrained(processor_id)
        model.to(self.device)

        self.model = model
        self.processor = processor

# if __name__ == '__main__':
#     print(WhisperPipe({'model_id': '/root/ztmz/script_recognizer/whisper-small-hi/checkpoint-1700-best-2000steps-1000'}).recognize('/mnt/c/Users/som-13700/Desktop/ea_wrc_recorded_pieces/克罗地亚音频5月14/CROATIA RALLY_Kostanjevac/-2-0.133-638513087426310322.wav'))