本文目录导读:
- 方法一:使用SpeechRecognition库(在线识别)
- 方法二:使用百度语音识别API
- 方法三:使用Whisper(OpenAI开源模型,支持离线)
- 方法四:完整GUI应用(使用Tkinter)
- 方法五:语音转文字的进阶应用
- 使用注意事项
- 最佳实践建议
我将为您提供一个完整的Python语音转文字案例,使用多种主流方法实现。
方法一:使用SpeechRecognition库(在线识别)
安装依赖
pip install SpeechRecognition
pip install PyAudio  # 录音所需
pip install pocketsphinx  # 离线识别
基本示例代码
import speech_recognition as sr
# Create the module-level recognizer that record_and_recognize() uses.
recognizer = sr.Recognizer()
def record_and_recognize():
    """Record one utterance from the microphone and transcribe it.

    The phrase is captured with the module-level ``recognizer`` and sent to
    Google's web speech API (network access required) as Mandarin Chinese.

    Returns:
        The recognized text, or an empty string when nothing was heard in
        time, the speech could not be understood, or the request failed.
    """
    with sr.Microphone() as source:
        print("请说话...")
        # Sample one second of background noise so the recognizer can adapt.
        recognizer.adjust_for_ambient_noise(source, duration=1)
        try:
            audio = recognizer.listen(source, timeout=5, phrase_time_limit=10)
        except sr.WaitTimeoutError:
            # listen() raises when no phrase starts within `timeout` seconds;
            # treat silence as "no result" instead of crashing the script.
            print("超时,未检测到语音")
            return ""

    # The microphone is released before the (slower) network request starts.
    print("录音完成,正在识别...")
    try:
        text = recognizer.recognize_google(audio, language='zh-CN')
    except sr.UnknownValueError:
        print("无法识别语音")
    except sr.RequestError as e:
        print(f"请求出错: {e}")
    except Exception as e:
        print(f"错误: {e}")
    else:
        print(f"识别结果: {text}")
        return text
    return ""
# Run one record-and-recognize pass and print what came back.
result = record_and_recognize()
print(f"最终结果: {result}")
方法二:使用百度语音识别API
安装依赖
pip install baidu-aip
百度API实现
from aip import AipSpeech
import wave
import pyaudio
class BaiduSpeechRecognition:
    """Record microphone audio and transcribe it with Baidu's speech API."""

    def __init__(self, app_id, api_key, secret_key):
        """Create the ``AipSpeech`` client from the given credentials."""
        self.client = AipSpeech(app_id, api_key, secret_key)

    def record_audio(self, filename="output.wav", record_seconds=5):
        """Record from the default input device and save the result as WAV.

        Audio is captured as 16 kHz, mono, 16-bit PCM, matching the sample
        rate that ``recognize_from_file`` reports to the API.

        Args:
            filename: Path of the WAV file to write.
            record_seconds: Length of the recording in seconds.

        Returns:
            ``filename``, so the result can be passed straight to
            ``recognize_from_file``.
        """
        CHUNK = 1024
        FORMAT = pyaudio.paInt16
        CHANNELS = 1
        RATE = 16000

        p = pyaudio.PyAudio()
        try:
            sample_width = p.get_sample_size(FORMAT)
            stream = p.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
            try:
                print("开始录音...")
                chunk_count = int(RATE / CHUNK * record_seconds)
                frames = [stream.read(CHUNK) for _ in range(chunk_count)]
                print("录音结束!")
            finally:
                # Release the input stream even if a read fails midway.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()

        # The context manager closes the file even if writing fails.
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(sample_width)
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
        return filename

    def recognize_from_file(self, audio_file):
        """Send a WAV file to the API and return the first transcript.

        Args:
            audio_file: Path of a 16 kHz WAV file.

        Returns:
            The first recognition candidate, or an empty string when the API
            reports an error or returns no candidates.
        """
        with open(audio_file, 'rb') as fp:
            audio_bytes = fp.read()

        result = self.client.asr(audio_bytes, 'wav', 16000, {
            'dev_pid': 1537,  # Mandarin (with basic English support)
        })

        candidates = result.get('result')
        if result.get('err_no') == 0 and candidates:
            return candidates[0]

        # Fall back to the raw response when no error message is provided.
        print(f"识别失败: {result.get('err_msg', result)}")
        return ""
# 使用示例
def baidu_demo():
    """Record five seconds of speech and print Baidu's transcription.

    The credentials below are placeholders; real ones are issued by the
    Baidu AI console:
    https://console.bce.baidu.com/ai/#/ai/speech/overview/resource/getFree
    """
    credentials = ('你的APP_ID', '你的API_KEY', '你的SECRET_KEY')
    client = BaiduSpeechRecognition(*credentials)

    # Capture to a WAV file first, then hand that file to the recognizer.
    wav_path = client.record_audio(record_seconds=5)
    transcript = client.recognize_from_file(wav_path)
    print(f"识别结果: {transcript}")
# 运行
# baidu_demo()
方法三:使用Whisper(OpenAI开源模型,支持离线)
安装依赖
pip install openai-whisper
pip install sounddevice numpy scipy  # 如果需要录音(下方示例使用sounddevice而非pyaudio)
Whisper实现
import whisper
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wavfile
class WhisperRecognition:
    """Offline speech-to-text backed by an OpenAI Whisper model."""

    # Sample rate the array passed to transcribe() is converted to.
    # NOTE(review): Whisper is understood to expect 16 kHz input for raw
    # arrays - confirm against the installed whisper version.
    TARGET_SAMPLE_RATE = 16000

    def __init__(self, model_size="base"):
        """Load a Whisper model.

        Args:
            model_size: Model name handed to ``whisper.load_model``, e.g.
                "tiny", "base", "small", "medium" or "large".
        """
        print(f"加载Whisper {model_size} 模型...")
        self.model = whisper.load_model(model_size)

    def record_audio(self, duration=5, sample_rate=16000):
        """Record mono audio from the default input device.

        Args:
            duration: Recording length in seconds.
            sample_rate: Capture rate in Hz.

        Returns:
            A ``(recording, sample_rate)`` tuple; ``recording`` is the array
            produced by ``sounddevice.rec`` with a single channel.
        """
        print("开始录音...")
        recording = sd.rec(int(duration * sample_rate),
                           samplerate=sample_rate,
                           channels=1,
                           dtype="float32")
        sd.wait()  # block until the capture has finished
        print("录音结束!")
        return recording, sample_rate

    def recognize_from_array(self, audio_array, sample_rate=16000):
        """Transcribe in-memory audio samples as Chinese.

        The samples are normalised before transcription: signed-integer PCM
        is scaled to floats, multi-channel input is averaged to mono, audio
        at another rate is resampled to ``TARGET_SAMPLE_RATE``, and the
        result is cast to float32.

        Args:
            audio_array: 1-D samples, or 2-D samples assumed to be laid out
                as (frames, channels) like sounddevice recordings.
            sample_rate: Sample rate of ``audio_array`` in Hz.

        Returns:
            The transcribed text.
        """
        samples = np.asarray(audio_array)

        if np.issubdtype(samples.dtype, np.signedinteger):
            # Integer PCM (e.g. int16) -> floats in [-1.0, 1.0).
            full_scale = np.iinfo(samples.dtype).max + 1
            samples = samples.astype(np.float32) / full_scale

        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        if sample_rate != self.TARGET_SAMPLE_RATE:
            # Imported here so the dependency is only needed when resampling.
            from math import gcd
            from scipy.signal import resample_poly

            source_rate = int(sample_rate)
            common = gcd(source_rate, self.TARGET_SAMPLE_RATE)
            samples = resample_poly(samples,
                                    self.TARGET_SAMPLE_RATE // common,
                                    source_rate // common)

        samples = np.ascontiguousarray(samples, dtype=np.float32)
        result = self.model.transcribe(samples, language='zh')
        return result["text"]

    def recognize_from_file(self, audio_file):
        """Transcribe an audio file as Chinese.

        Args:
            audio_file: Path of the audio file to transcribe.

        Returns:
            The transcribed text.
        """
        result = self.model.transcribe(audio_file, language='zh')
        return result["text"]
# 使用示例
def whisper_demo():
    """Record five seconds from the microphone and print Whisper's transcript."""
    engine = WhisperRecognition(model_size="base")

    # Option 1: transcribe a live recording.
    print("=== 实时录音识别 ===")
    samples, rate = engine.record_audio(duration=5)
    first_channel = samples[:, 0]
    transcript = engine.recognize_from_array(first_channel, rate)
    print(f"识别结果: {transcript}")

    # Option 2: transcribe an existing file instead, e.g.
    # engine.recognize_from_file("test_audio.wav"), and print the result.
# 运行
# whisper_demo()
方法四:完整GUI应用(使用Tkinter)
import tkinter as tk
from tkinter import scrolledtext
import threading
import speech_recognition as sr
class SpeechToTextApp:
    """Tkinter window that records one phrase at a time and shows the text.

    Recording and recognition run on a background thread so the window stays
    responsive. The worker never calls widget methods itself: it queues UI
    updates with ``_post_ui`` and the Tk main loop applies them in
    ``_drain_ui``, keeping all widget calls on the main-loop thread.
    """

    # Milliseconds between checks of the pending UI-update list.
    UI_POLL_MS = 50

    def __init__(self, root):
        """Build the window and start polling for queued UI updates.

        Args:
            root: The Tk window that hosts the widgets.
        """
        self.root = root
        self.root.title("语音转文字")
        self.root.geometry("600x500")
        self.recognizer = sr.Recognizer()
        self.is_recording = False
        self._worker = None              # thread of the capture in progress
        self._ui_lock = threading.Lock()  # guards _ui_pending
        self._ui_pending = []            # (callback, args, kwargs) tuples
        self.setup_ui()
        self._drain_ui()

    def setup_ui(self):
        """Create the title, action buttons, transcript area and status bar."""
        # Title
        title = tk.Label(self.root, text="语音转文字工具", font=("Arial", 20))
        title.pack(pady=10)

        # Button row
        btn_frame = tk.Frame(self.root)
        btn_frame.pack(pady=10)

        self.record_btn = tk.Button(btn_frame, text="开始录音",
                                    command=self.toggle_recording,
                                    bg="green", fg="white", width=15)
        self.record_btn.pack(side=tk.LEFT, padx=5)

        clear_btn = tk.Button(btn_frame, text="清除",
                              command=self.clear_text,
                              bg="orange", width=15)
        clear_btn.pack(side=tk.LEFT, padx=5)

        copy_btn = tk.Button(btn_frame, text="复制结果",
                             command=self.copy_text,
                             bg="blue", fg="white", width=15)
        copy_btn.pack(side=tk.LEFT, padx=5)

        # Transcript area
        self.text_area = scrolledtext.ScrolledText(self.root,
                                                   wrap=tk.WORD,
                                                   width=70,
                                                   height=20,
                                                   font=("Arial", 12))
        self.text_area.pack(padx=20, pady=10, fill=tk.BOTH, expand=True)

        # Status bar
        self.status_label = tk.Label(self.root, text="准备就绪",
                                     bd=1, relief=tk.SUNKEN, anchor=tk.W)
        self.status_label.pack(fill=tk.X)

    def _post_ui(self, callback, *args, **kwargs):
        """Queue ``callback(*args, **kwargs)`` to run on the main-loop thread."""
        with self._ui_lock:
            self._ui_pending.append((callback, args, kwargs))

    def _drain_ui(self):
        """Run all queued UI updates, then schedule the next check."""
        with self._ui_lock:
            pending, self._ui_pending = self._ui_pending, []
        try:
            for callback, args, kwargs in pending:
                callback(*args, **kwargs)
        finally:
            # Keep polling even if one update raised.
            self.root.after(self.UI_POLL_MS, self._drain_ui)

    def _set_status(self, message):
        """Show ``message`` in the status bar."""
        self.status_label.config(text=message)

    def _append_transcript(self, text):
        """Append one recognized phrase to the transcript and scroll to it."""
        self.text_area.insert(tk.END, f"我: {text}\n")
        self.text_area.see(tk.END)

    def toggle_recording(self):
        """Start a capture, or reset the UI if one is marked as running.

        Stopping only resets the button and status: this class has no way to
        interrupt a ``listen()`` call that is already running, so the worker
        finishes on its own. A new capture is refused while that worker is
        still alive, so two threads never open the microphone together.
        """
        if self.is_recording:
            self.is_recording = False
            self.record_btn.config(text="开始录音", bg="green")
            self.status_label.config(text="录音已停止")
            return

        if self._worker is not None and self._worker.is_alive():
            self.status_label.config(text="上一次录音尚未结束,请稍候")
            return

        self.is_recording = True
        self.record_btn.config(text="停止录音", bg="red")
        self.status_label.config(text="正在录音...")
        # Record on a separate thread so the window does not freeze.
        self._worker = threading.Thread(target=self.record_and_recognize,
                                        daemon=True)
        self._worker.start()

    def record_and_recognize(self):
        """Worker body: capture one phrase, recognize it, report the outcome.

        Runs on the background thread. Every widget change is routed through
        ``_post_ui``. Opening the microphone happens inside the ``try`` so a
        device error is reported and the button is still reset.
        """
        try:
            with sr.Microphone() as source:
                self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
                audio = self.recognizer.listen(source, timeout=10,
                                               phrase_time_limit=5)
            self._post_ui(self._set_status, "正在识别...")
            text = self.recognizer.recognize_google(audio, language='zh-CN')
            self._post_ui(self._append_transcript, text)
            self._post_ui(self._set_status, "识别完成")
        except sr.WaitTimeoutError:
            self._post_ui(self._set_status, "超时,未检测到语音")
        except sr.UnknownValueError:
            self._post_ui(self._set_status, "无法识别语音")
        except sr.RequestError as e:
            self._post_ui(self._set_status, f"网络错误: {e}")
        except Exception as e:
            self._post_ui(self._set_status, f"错误: {str(e)}")
        finally:
            self.is_recording = False
            self._post_ui(self.record_btn.config, text="开始录音", bg="green")

    def clear_text(self):
        """Remove all text from the transcript area."""
        self.text_area.delete("1.0", tk.END)
        self.status_label.config(text="内容已清除")

    def copy_text(self):
        """Copy the transcript to the clipboard.

        The range ends at "end-1c" so the newline Tk always keeps at the end
        of a text widget is not copied.
        """
        content = self.text_area.get("1.0", "end-1c")
        self.root.clipboard_clear()
        self.root.clipboard_append(content)
        self.status_label.config(text="已复制到剪贴板")
# 运行GUI应用
def run_gui():
    """Create the main window, attach the app and enter the Tk event loop."""
    window = tk.Tk()
    # Bound to a name so the app object exists for the whole event loop.
    gui = SpeechToTextApp(window)
    window.mainloop()
# 取消注释运行
# run_gui()
方法五:语音转文字的进阶应用
实时语音识别 + 关键词提取
import speech_recognition as sr
import jieba
from collections import Counter
class AdvancedSTT:
    """Continuous microphone transcription with simple keyword statistics."""

    def __init__(self):
        """Create the recognizer and an empty transcript history."""
        self.recognizer = sr.Recognizer()
        self.text_history = []  # every recognized sentence, in order

    @staticmethod
    def _keywords(text):
        """Segment ``text`` with jieba and keep tokens longer than one character."""
        return [word for word in jieba.lcut(text) if len(word) > 1]

    def real_time_recognition(self):
        """Listen in a loop, printing each sentence and its keywords.

        Silence and unintelligible audio are skipped so the loop keeps
        running. Ctrl+C ends the session and prints the statistics. Any
        other error, such as a failed API request, is printed and ends the
        loop.
        """
        with sr.Microphone() as source:
            print("开始监听,按Ctrl+C退出...")
            self.recognizer.adjust_for_ambient_noise(source, duration=1)

            try:
                while True:
                    print("请说话...")
                    try:
                        audio = self.recognizer.listen(source, timeout=5)
                        text = self.recognizer.recognize_google(
                            audio, language='zh-CN')
                    except sr.WaitTimeoutError:
                        # Nobody spoke within the timeout; keep listening.
                        continue
                    except sr.UnknownValueError:
                        print("无法识别语音")
                        continue

                    print(f"识别: {text}")
                    self.text_history.append(text)

                    # Keyword extraction
                    important_words = self._keywords(text)
                    if important_words:
                        print(f"关键词: {important_words}")

            except KeyboardInterrupt:
                print("\n结束录音")
                self.show_statistics()
            except Exception as e:
                print(f"错误: {e}")

    def show_statistics(self):
        """Print the sentence count and the five most frequent keywords.

        Frequencies use the same filter as the live keyword output, so
        single-character tokens are not counted.
        """
        print("\n=== 统计信息 ===")
        print(f"总句子数: {len(self.text_history)}")

        word_freq = Counter(
            word
            for sentence in self.text_history
            for word in self._keywords(sentence)
        )

        print("高频词:")
        for word, count in word_freq.most_common(5):
            print(f" {word}: {count}次")
# 使用示例
def advanced_demo():
    """Start a listening session on a fresh ``AdvancedSTT`` instance."""
    AdvancedSTT().real_time_recognition()
# 取消注释运行
# advanced_demo()
使用注意事项
- 网络要求:在线API需要稳定的网络连接
- 音频质量:保持麦克风清晰,减少环境噪音
- 模型选择:Whisper的large模型效果最好但需要更多资源
- API密钥:使用百度等API需要申请并妥善保管密钥
- 隐私保护:在线服务会传输音频数据,敏感内容建议使用离线方案
最佳实践建议
# 根据需求选择合适的方案
def get_stt_solution(use_case):
    """Recommend a speech-to-text stack for a usage scenario.

    Args:
        use_case: One of "quick_demo", "production", "offline", "realtime"
            or "batch".

    Returns:
        The recommendation for ``use_case``, or a hint listing the valid
        keys when ``use_case`` is not one of them.
    """
    recommendations = dict(
        quick_demo="SpeechRecognition + Google API",
        production="百度/阿里云 API (稳定可靠)",
        offline="Whisper (支持离线,准确度高)",
        realtime="Vosk (轻量级,适合实时应用)",
        batch="讯飞API (适合大批量处理)",
    )
    try:
        return recommendations[use_case]
    except KeyError:
        return "请选择: quick_demo, production, offline, realtime, batch"
print(get_stt_solution("offline")) # prints the full "offline" entry: Whisper (支持离线,准确度高)
您可以根据实际需求选择合适的方法,如果您需要特定平台的实现或有其他要求,请告诉我!
标签: Python编程