From 6e34525536a8487a290ee83e5f9adf64499ef837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=95=A3=E4=BB=99?= Date: Thu, 21 Nov 2024 19:56:16 +0800 Subject: [PATCH] DEBUG --- pp.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/pp.py b/pp.py index dafa580..58fe3e4 100755 --- a/pp.py +++ b/pp.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3.10 +import gc import os import io import oss2 @@ -17,6 +18,7 @@ import paddle from PIL import Image, ImageFile from paddleocr import PaddleOCR +paddle.set_flags({'FLAGS_fraction_of_gpu_memory_to_use': 0.6}) # 限制显存占用为GPU的60% logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印 logging.disable(logging.WARNING) # 关闭WARNING日志的打印 warnings.filterwarnings("ignore") @@ -84,6 +86,13 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan") KR = PaddleOCR(use_angle_cls=True, lang="korean") RU = PaddleOCR(use_angle_cls=True, lang="ru") +# 运行OCR并清理内存 +def process_ocr(model, image): + result = model.ocr(image, cls=True)[0] or [] + paddle.device.cuda.empty_cache() # 清理缓存 + gc.collect() # 强制垃圾回收 + return result + def process_images(conn, offset=0) -> int: with conn.cursor(pymysql.cursors.SSCursor) as cursor: cursor.execute("SELECT id, content FROM web_images WHERE text='' AND article_category_top_id=22 LIMIT 100 OFFSET %s", (offset,)) @@ -93,14 +102,20 @@ def process_images(conn, offset=0) -> int: continue if isinstance(image, Image.Image): image = np.array(image) - print('---------------------', id, content) + print(id, content) # 執行提取文字 - en = EN.ocr(image, cls=True)[0] or [] - ch = CH.ocr(image, cls=True)[0] or [] - jp = JP.ocr(image, cls=True)[0] or [] - kr = KR.ocr(image, cls=True)[0] or [] - ru = RU.ocr(image, cls=True)[0] or [] + #en = EN.ocr(image, cls=True)[0] or [] + #ch = CH.ocr(image, cls=True)[0] or [] + #jp = JP.ocr(image, cls=True)[0] or [] + #kr = KR.ocr(image, cls=True)[0] or [] + #ru = RU.ocr(image, cls=True)[0] or [] + # 处理每个模型 + ru = process_ocr(RU, image) + en = process_ocr(EN, image) + ch = 
process_ocr(CH, image) + jp = process_ocr(JP, image) + kr = process_ocr(KR, image) # 排除字符长度小于2的行, 排除纯数字的行, 排除置信度小于 0.8 的行 jp = [x for x in jp if len(x[1][0]) > 1 and not x[1][0].isdigit() and x[1][1] > 0.8] @@ -142,12 +157,17 @@ def process_images(conn, offset=0) -> int: data = json.dumps(data, ensure_ascii=False, cls=MyEncoder) c.execute("UPDATE web_images SET text = %s WHERE id = %s", (data, id)) conn.commit() + paddle.device.cuda.empty_cache() # 清理缓存 + gc.collect() # 强制垃圾回收 + paddle.device.cuda.empty_cache() # 清理缓存 + gc.collect() # 强制垃圾回收 return offset+100 def main(): conn = connect_to_mysql() offset = 2000 while True: + print("LOOP:", offset) offset = process_images(conn, offset) time.sleep(0)