from pathlib import Path
from typing import Any, Dict, Union

import torch
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

from utils.logger_config import setup_logger

logger = setup_logger(__name__)


class SenseVoiceTranscriber:
    """Wrap a FunASR ``AutoModel`` pipeline (ASR + VAD + punctuation + speaker)."""

    def __init__(self, model_dir="/data/data/luosy/models/iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn"):
        """
        Load the speech pipeline onto the best available device.

        Args:
            model_dir: Model directory or model name handed to ``AutoModel``.

        Raises:
            Exception: Any error raised while selecting the device or loading
                the model is logged and then re-raised unchanged.
        """
        try:
            device = self._select_device()

            logger.info(f"Loading SenseVoice model (model={model_dir}, device={device})")

            self.model = AutoModel(
                model=model_dir,
                model_revision="v2.0.4",
                # Voice-activity-detection model: splits long audio into segments.
                vad_model="fsmn-vad",
                vad_model_revision="v2.0.4",
                # Punctuation model: adds punctuation marks to the text.
                punc_model="ct-punc-c",
                punc_model_revision="v2.0.4",
                # Speaker model: determines who is speaking.
                spk_model="cam++",
                trust_remote_code=True,
                disable_update=True,
                vad_kwargs={
                    "max_single_segment_time": 15000,
                    "min_duration": 500,
                    "speech_pad": 300,
                },
                punc_kwargs={
                    "window_size": 128,
                    "period_symbol": "。",
                },
                spk_kwargs={
                    "spk_threshold": 0.7,
                },
                device=device,
            )

            logger.info("SenseVoice model loaded successfully")

        except Exception as e:
            logger.error(f"Failed to load SenseVoice model: {str(e)}")
            raise

    @staticmethod
    def _select_device() -> str:
        """Return the torch device string to use, preferring MPS, then CUDA, then CPU."""
        # ``torch.backends.mps`` is missing on some PyTorch builds; guard the
        # lookup so those builds fall through to CUDA/CPU instead of failing.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            logger.info("Using MPS acceleration")
            return "mps"
        if torch.cuda.is_available():
            logger.info("Using CUDA acceleration")
            return "cuda:0"
        logger.info("Using CPU processing")
        return "cpu"

    def transcribe(self, audio_path: str) -> Union[Dict[str, Any], str]:
        """
        Run the loaded pipeline on one audio file.

        Args:
            audio_path: Path to the audio file.

        Returns:
            On success, the first entry of the list returned by
            ``self.model.generate`` (a mapping with a non-empty ``"text"``
            value; its other keys are whatever the model emits).
            An empty string ``""`` when the model returns nothing, when the
            first entry has no text, or when any exception occurs (the error
            is logged, not raised).
        """
        try:
            logger.info(f"开始处理音频文件: {audio_path}")

            # Inference only: gradient tracking is not needed.
            with torch.no_grad():
                res = self.model.generate(
                    input=audio_path,
                    cache={},
                    speaker_info={"spk_num": 2},
                    language="zh",
                    use_itn=True,
                    batch_size_s=30,
                    hotword=["材质", "面料", "版型", "合身"],
                    beam_size=20,
                    merge_vad=True,
                    merge_length_s=10,
                    without_timestamps=False,
                    ban_emo_unk=True,
                    sentence_timestamp=True,
                )

            # No results, or a first result without text, means no speech.
            if not res or not res[0].get("text"):
                logger.info("No speech detected")
                return ""

            # The full first result entry is returned, not just its text.
            transcript = res[0]
            logger.info("STT执行完成!")
            return transcript

        except Exception as e:
            # Best effort: callers get "" instead of an exception.
            logger.error(f"Audio processing failed: {str(e)}")
            return ""