本文目录导读:
我来为你提供一个完整的Python文字识别(OCR)实操案例,使用常见的Tesseract和PaddleOCR两种方案。
使用Tesseract OCR
环境准备
首先安装Tesseract引擎和Python库:
# 安装Tesseract引擎
# Windows: 下载安装 https://github.com/UB-Mannheim/tesseract/wiki
# macOS: brew install tesseract
# Linux: sudo apt-get install tesseract-ocr
# (识别中文还需要安装 chi_sim 语言包)
# 安装Python库
pip install pytesseract pillow opencv-python
基础文字识别示例
import pytesseract
from PIL import Image
import cv2
import numpy as np
# 配置Tesseract路径(Windows需要)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def basic_ocr(image_path):
    """Recognize the text in an image file with Tesseract.

    Args:
        image_path: Path of the image to read; opened with ``Image.open``.

    Returns:
        The text returned by ``pytesseract.image_to_string`` using the
        simplified-Chinese + English language packs (``chi_sim+eng``).
    """
    # Open the image in a context manager so the underlying file handle is
    # closed as soon as recognition finishes instead of lingering until GC.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image, lang='chi_sim+eng')
# Usage example: recognize one image and print the text under a heading.
result = basic_ocr('example.jpg')
print("识别结果:", result, sep="\n")
图像预处理优化
def preprocess_and_ocr(image_path, threshold=150):
    """Binarize and denoise an image, then run Tesseract on the result.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        threshold: Gray level passed to ``cv2.threshold``; pixels above it
            become 255 and the rest 0. Defaults to 150.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for the
        pre-processed image.

    Raises:
        OSError: If ``cv2.imread`` could not load ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising;
    # fail here with a clear message instead of an opaque cvtColor error.
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    # Grayscale -> fixed-threshold binarization -> 3x3 median filter.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)
    denoised = cv2.medianBlur(binary, 3)

    # --oem 3: default engine mode (use whichever engine is available).
    # --psm 6: assume a single uniform block of text.
    custom_config = r'--oem 3 --psm 6'
    return pytesseract.image_to_string(
        denoised,
        lang='chi_sim+eng',
        config=custom_config,
    )
def extract_text_with_boxes(image_path):
    """Show per-character boxes for an image and return detailed OCR data.

    The image is displayed in a window titled ``'Text Boxes'`` and the call
    blocks until a key is pressed.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.

    Returns:
        The dictionary produced by ``pytesseract.image_to_data`` with
        ``output_type=pytesseract.Output.DICT``.

    Raises:
        OSError: If ``cv2.imread`` could not load ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure instead of raising.
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    # OpenCV loads images as BGR while pytesseract assumes RGB.
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Run both OCR passes on the untouched image *before* anything is drawn,
    # so the rectangles and labels below cannot leak into the OCR input.
    boxes = pytesseract.image_to_boxes(rgb, lang='chi_sim+eng')
    data = pytesseract.image_to_data(
        rgb,
        lang='chi_sim+eng',
        output_type=pytesseract.Output.DICT,
    )

    annotated = img.copy()
    height = annotated.shape[0]
    for row in boxes.splitlines():
        fields = row.split()
        if len(fields) < 5:
            continue  # skip malformed rows rather than crash on indexing
        char = fields[0]
        x, y, x2, y2 = (int(value) for value in fields[1:5])
        # Box coordinates are measured from the bottom-left corner, whereas
        # OpenCV's origin is the top-left corner, hence the ``height - y``.
        cv2.rectangle(annotated, (x, height - y), (x2, height - y2), (0, 255, 0), 2)
        # NOTE: Hershey fonts cannot render non-ASCII glyphs; OpenCV draws
        # '?' for characters such as Chinese.
        cv2.putText(annotated, char, (x, height - y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

    cv2.imshow('Text Boxes', annotated)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return data
使用PaddleOCR(推荐)
PaddleOCR在中文识别上效果更好,准确率更高。
安装
pip install paddlepaddle paddleocr
基础使用
from paddleocr import PaddleOCR
import cv2
def paddle_ocr_basic(image_path):
    """Run PaddleOCR on one image and print every recognized text line.

    Args:
        image_path: Image passed to ``PaddleOCR.ocr``.

    Returns:
        The raw result of ``PaddleOCR.ocr``: one entry per image, each entry
        holding ``[box, (text, confidence)]`` items.
    """
    # 'ch' selects the Chinese model.
    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)
    result = ocr.ocr(image_path, cls=True)

    for line in result or []:
        # PaddleOCR yields None for an image in which no text was detected.
        if not line:
            continue
        for item in line:
            text, confidence = item[1][0], item[1][1]
            print(f"文字: {text}, 置信度: {confidence:.2f}")
    return result
def draw_ocr_result(image_path, result):
    """Draw PaddleOCR boxes and texts on an image and display it.

    The annotated image is shown in a window titled ``'OCR Result'`` and the
    call blocks until a key is pressed.

    Args:
        image_path: Path of the image to load with ``cv2.imread``.
        result: Value returned by ``PaddleOCR.ocr`` for that image.

    Raises:
        OSError: If ``cv2.imread`` could not load ``image_path``.
    """
    img = cv2.imread(image_path)
    # cv2.imread returns None on failure instead of raising.
    if img is None:
        raise OSError(f"Could not read image: {image_path}")

    for line in result or []:
        # PaddleOCR yields None for an image in which no text was detected.
        if not line:
            continue
        for item in line:
            box = item[0]
            text = item[1][0]
            # Outline the detected text region as a closed polygon.
            points = np.array(box, dtype=np.int32).reshape((-1, 1, 2))
            cv2.polylines(img, [points], True, (0, 255, 0), 2)
            # Label just above the first corner of the box.
            # NOTE: Hershey fonts cannot render non-ASCII glyphs; OpenCV
            # draws '?' for characters such as Chinese.
            x, y = int(box[0][0]), int(box[0][1])
            cv2.putText(img, text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.7, (0, 0, 255), 2)

    cv2.imshow('OCR Result', img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
批量处理示例
import os
from paddleocr import PaddleOCR
import json
class BatchOCRProcessor:
    """Run PaddleOCR over every image in a folder and save the texts as JSON."""

    # File extensions (compared case-insensitively) treated as images.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp')

    def __init__(self):
        """Create the shared PaddleOCR engine (Chinese model, CPU)."""
        self.ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)

    def process_single(self, image_path):
        """Recognize one image.

        Args:
            image_path: Image passed to ``PaddleOCR.ocr``.

        Returns:
            A list of dicts with keys ``'text'``, ``'confidence'`` (float)
            and ``'box'``; empty when no text was detected.
        """
        result = self.ocr.ocr(image_path, cls=True)
        texts = []
        for line in result or []:
            # PaddleOCR yields None for an image in which no text was
            # detected; treat that as "no texts" instead of crashing.
            if not line:
                continue
            for item in line:
                texts.append({
                    'text': item[1][0],
                    'confidence': float(item[1][1]),
                    'box': item[0],
                })
        return texts

    def process_batch(self, input_dir, output_file):
        """Recognize every image in ``input_dir`` and dump the results.

        Args:
            input_dir: Directory whose direct children are scanned.
            output_file: Path of the UTF-8 JSON file to write.

        Returns:
            A dict mapping each processed file name to the list returned by
            ``process_single``.
        """
        results = {}
        # os.listdir order is arbitrary; sort so runs are reproducible.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
                continue
            filepath = os.path.join(input_dir, filename)
            print(f"处理中: {filename}")
            results[filename] = self.process_single(filepath)

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        return results
# Usage example: OCR every image under 'images_folder' into one JSON file.
processor = BatchOCRProcessor()
results = processor.process_batch(
    input_dir='images_folder',
    output_file='ocr_results.json',
)
从视频中提取文字
import cv2
from paddleocr import PaddleOCR
def video_ocr(video_path, output_frames=False, frame_interval=30):
    """Extract text from a video by running PaddleOCR on sampled frames.

    Each sampled frame is annotated with the detected text polygons and
    shown in a window titled ``'Video OCR'``; pressing ``q`` stops early.

    Args:
        video_path: Source passed to ``cv2.VideoCapture``.
        output_frames: Accepted for backward compatibility; this function
            does not read it.
        frame_interval: Run OCR on every ``frame_interval``-th frame so that
            near-identical consecutive frames are skipped. Defaults to 30.

    Returns:
        A list of ``{'frame': <1-based frame index>, 'texts': [...]}`` dicts,
        one per sampled frame (``'texts'`` is empty when nothing was found).

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    if frame_interval < 1:
        raise ValueError("frame_interval must be >= 1")

    ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    text_results = []

    # try/finally guarantees the capture and any windows are released even
    # if OCR or drawing raises part-way through the video.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1
            if frame_count % frame_interval != 0:
                continue

            result = ocr.ocr(frame, cls=True)
            frame_texts = []
            for line in result or []:
                # PaddleOCR yields None for a frame without detected text.
                if not line:
                    continue
                for item in line:
                    frame_texts.append(item[1][0])
                    # Outline the detected region on the frame.
                    points = np.array(item[0], dtype=np.int32).reshape((-1, 1, 2))
                    cv2.polylines(frame, [points], True, (0, 255, 0), 2)

            text_results.append({
                'frame': frame_count,
                'texts': frame_texts,
            })

            # Live preview of the annotated frame; 'q' aborts the scan.
            cv2.imshow('Video OCR', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()
    return text_results
完整实战案例:发票识别系统
import re
from paddleocr import PaddleOCR
import cv2
import json
class InvoiceOCR:
    """Extract a few key fields from an invoice image with PaddleOCR."""

    # Compiled once at class creation instead of once per recognized line.
    _INVOICE_NUMBER_RE = re.compile(r'^\d{8,10}$')
    _DATE_RE = re.compile(r'\d{4}[-年]\d{1,2}[-月]\d{1,2}[日]?')
    # Accept both the half-width and the full-width yen/yuan sign.
    _AMOUNT_RE = re.compile(r'[¥￥]?\d+\.\d{2}')
    _ITEM_KEYWORDS = ('商品', '明细')

    def __init__(self):
        """Create the PaddleOCR engine (Chinese model, CPU)."""
        self.ocr = PaddleOCR(use_angle_cls=True, lang='ch', use_gpu=False)

    def extract_invoice_info(self, image_path):
        """Recognize an invoice image and pick out key fields.

        Every recognized text line is tested against the patterns below; a
        later matching line overwrites an earlier one.

        Args:
            image_path: Image passed to ``PaddleOCR.ocr``.

        Returns:
            A dict with keys ``'invoice_number'``, ``'date'``,
            ``'total_amount'``, ``'seller'``, ``'buyer'`` and ``'items'``.
            Matched fields hold the full recognized line; unmatched fields
            stay ``None``. ``'seller'`` and ``'buyer'`` are never populated
            by this method. ``'items'`` lists each line containing one of
            the item keywords.
        """
        result = self.ocr.ocr(image_path, cls=True)
        invoice_info = {
            'invoice_number': None,
            'date': None,
            'total_amount': None,
            'seller': None,
            'buyer': None,
            'items': [],
        }

        for line in result or []:
            # PaddleOCR yields None for an image without detected text.
            if not line:
                continue
            for item in line:
                text = item[1][0]
                # Invoice number: a line that is nothing but 8-10 digits.
                if self._INVOICE_NUMBER_RE.match(text):
                    invoice_info['invoice_number'] = text
                if self._DATE_RE.search(text):
                    invoice_info['date'] = text
                if self._AMOUNT_RE.search(text):
                    invoice_info['total_amount'] = text
                # Check keywords per recognized line. The previous version
                # split a space-joined string on '\n', which never splits,
                # so it appended the whole text or nothing.
                if any(keyword in text for keyword in self._ITEM_KEYWORDS):
                    invoice_info['items'].append(text)
        return invoice_info

    def save_to_json(self, info, output_path):
        """Write ``info`` to ``output_path`` as UTF-8 JSON and report the path.

        Args:
            info: JSON-serializable object to store.
            output_path: Destination file path; overwritten if it exists.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(info, f, ensure_ascii=False, indent=2)
        print(f"结果已保存到: {output_path}")
# Usage example: exercise each OCR path when the file is run as a script.
if __name__ == "__main__":
    # Plain Tesseract recognition.
    print("=== 基础OCR测试 ===")
    text = basic_ocr('test_image.jpg')
    print(text)

    # PaddleOCR-based invoice extraction (the recommended engine).
    print("\n=== PaddleOCR测试 ===")
    processor = InvoiceOCR()
    info = processor.extract_invoice_info('invoice.jpg')
    print(
        "提取的发票信息:",
        json.dumps(info, ensure_ascii=False, indent=2),
        sep="\n",
    )

    # Folder-wide batch run.
    print("\n=== 批量处理 ===")
    batch_processor = BatchOCRProcessor()
    results = batch_processor.process_batch(
        input_dir='invoices_folder',
        output_file='all_invoices.json',
    )
注意事项
- 图像质量:清晰的图片能大幅提升识别准确率
- 预处理:适当进行二值化、去噪等预处理
- 语言设置:根据图片内容选择正确的语言包
- GPU加速:如果有NVIDIA GPU,建议启用GPU加速
- 方案选择:在中文识别场景下,PaddleOCR的准确率通常高于Tesseract
这个案例包含了从基础到进阶的OCR使用场景,你可以根据实际需求选择合适的方法。
标签: 图像文字提取