This commit is contained in:
散仙
2024-11-21 19:56:16 +08:00
parent 6500fd5b92
commit 6e34525536

32
pp.py
View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python3.10
import gc
import os
import io
import oss2
@@ -17,6 +18,7 @@ import paddle
from PIL import Image, ImageFile
from paddleocr import PaddleOCR
# Runtime configuration applied at import time.
paddle.set_flags({'FLAGS_fraction_of_gpu_memory_to_use': 0.6}) # set Paddle's GPU memory fraction flag to 0.6 (i.e. 60%)
logging.disable(logging.DEBUG) # suppress DEBUG log output
# NOTE(review): logging.disable(WARNING) also covers DEBUG, so the call above looks redundant - confirm before removing.
logging.disable(logging.WARNING) # suppress log output at WARNING severity and below
warnings.filterwarnings("ignore") # ignore every warning raised through the warnings module
@@ -84,6 +86,13 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
# Korean and Russian PaddleOCR instances, both created with use_angle_cls=True.
KR = PaddleOCR(use_angle_cls=True, lang="korean")
RU = PaddleOCR(use_angle_cls=True, lang="ru")
# Run OCR with one model, then release cached memory.
def process_ocr(model, image):
    """Run ``model`` on ``image`` and free cached memory afterwards.

    Args:
        model: Object exposing ``ocr(image, cls=True)``; the callers in this
            file pass the module-level PaddleOCR instances.
        image: Image to recognise, forwarded to ``model.ocr`` unchanged.

    Returns:
        The first element of the ``model.ocr`` result, or an empty list when
        that element is falsy (for example ``None``).

    Raises:
        Whatever ``model.ocr`` raises; the cleanup below still runs first.
    """
    try:
        return model.ocr(image, cls=True)[0] or []
    finally:
        # Collect Python garbage before emptying the CUDA cache so that memory
        # held only by unreachable objects can be released in the same pass.
        # Using ``finally`` keeps the cleanup from being skipped when OCR fails.
        gc.collect()
        paddle.device.cuda.empty_cache()
def process_images(conn, offset=0) -> int:
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
cursor.execute("SELECT id, content FROM web_images WHERE text='' AND article_category_top_id=22 LIMIT 100 OFFSET %s", (offset,))
@@ -93,14 +102,20 @@ def process_images(conn, offset=0) -> int:
continue
if isinstance(image, Image.Image):
image = np.array(image)
print('---------------------', id, content)
print(id, content)
# 執行提取文字
en = EN.ocr(image, cls=True)[0] or []
ch = CH.ocr(image, cls=True)[0] or []
jp = JP.ocr(image, cls=True)[0] or []
kr = KR.ocr(image, cls=True)[0] or []
ru = RU.ocr(image, cls=True)[0] or []
#en = EN.ocr(image, cls=True)[0] or []
#ch = CH.ocr(image, cls=True)[0] or []
#jp = JP.ocr(image, cls=True)[0] or []
#kr = KR.ocr(image, cls=True)[0] or []
#ru = RU.ocr(image, cls=True)[0] or []
# 处理每个模型
ru = process_ocr(RU, image)
en = process_ocr(EN, image)
ch = process_ocr(CH, image)
jp = process_ocr(JP, image)
kr = process_ocr(KR, image)
# 排除字符长度小于2的行, 排除纯数字的行, 排除置信度小于 0.8 的行
jp = [x for x in jp if len(x[1][0]) > 1 and not x[1][0].isdigit() and x[1][1] > 0.8]
@@ -142,12 +157,17 @@ def process_images(conn, offset=0) -> int:
data = json.dumps(data, ensure_ascii=False, cls=MyEncoder)
c.execute("UPDATE web_images SET text = %s WHERE id = %s", (data, id))
conn.commit()
paddle.device.cuda.empty_cache() # 清理缓存
gc.collect() # 强制垃圾回收
paddle.device.cuda.empty_cache() # 清理缓存
gc.collect() # 强制垃圾回收
return offset+100
def main(start_offset=2000):
    """Process images batch after batch in an endless loop.

    Opens a MySQL connection via ``connect_to_mysql`` and repeatedly calls
    ``process_images``, feeding each call the offset returned by the previous
    one. The loop has no exit condition, so this function only stops when an
    exception propagates out of it.

    Args:
        start_offset: Offset passed to the first ``process_images`` call.
            Defaults to 2000, the value that used to be hard-coded here.
    """
    conn = connect_to_mysql()
    offset = start_offset
    while True:
        print("LOOP:", offset)
        offset = process_images(conn, offset)
        # Zero-length sleep kept from the original; presumably a placeholder
        # for throttling between batches - TODO confirm.
        time.sleep(0)