DEBUG
This commit is contained in:
32
pp.py
32
pp.py
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python3.10
|
||||
|
||||
import gc
|
||||
import os
|
||||
import io
|
||||
import oss2
|
||||
@@ -17,6 +18,7 @@ import paddle
|
||||
from PIL import Image, ImageFile
|
||||
from paddleocr import PaddleOCR
|
||||
|
||||
paddle.set_flags({'FLAGS_fraction_of_gpu_memory_to_use': 0.6}) # GPU memory fraction flag set to 0.6, i.e. 60% of device memory
|
||||
logging.disable(logging.DEBUG) # Suppress log records at DEBUG level and below
|
||||
logging.disable(logging.WARNING) # Suppress log records at WARNING level and below; this supersedes the DEBUG-level call
|
||||
warnings.filterwarnings("ignore")
|
||||
@@ -84,6 +86,13 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
|
||||
KR = PaddleOCR(use_angle_cls=True, lang="korean")
|
||||
RU = PaddleOCR(use_angle_cls=True, lang="ru")
|
||||
|
||||
# Run OCR and free memory
|
||||
def process_ocr(model, image):
    """Run one OCR model on an image and release memory afterwards.

    Args:
        model: Object exposing ``ocr(image, cls=True)``; the module-level
            PaddleOCR instances are passed here.
        image: Image input, forwarded unchanged to ``model.ocr``.

    Returns:
        The first element of the ``model.ocr`` result, or an empty list when
        that element is falsy.

    Raises:
        Whatever ``model.ocr`` raises; the cleanup below still runs first.
    """
    try:
        return model.ocr(image, cls=True)[0] or []
    finally:
        # Clean up on the error path too, so a failing image does not leave
        # memory held for the rest of the batch.
        gc.collect()  # drop unreachable Python objects first ...
        paddle.device.cuda.empty_cache()  # ... then release idle cached GPU memory
|
||||
|
||||
def process_images(conn, offset=0) -> int:
|
||||
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
||||
cursor.execute("SELECT id, content FROM web_images WHERE text='' AND article_category_top_id=22 LIMIT 100 OFFSET %s", (offset,))
|
||||
@@ -93,14 +102,20 @@ def process_images(conn, offset=0) -> int:
|
||||
continue
|
||||
if isinstance(image, Image.Image):
|
||||
image = np.array(image)
|
||||
print('---------------------', id, content)
|
||||
print(id, content)
|
||||
|
||||
# 執行提取文字
|
||||
en = EN.ocr(image, cls=True)[0] or []
|
||||
ch = CH.ocr(image, cls=True)[0] or []
|
||||
jp = JP.ocr(image, cls=True)[0] or []
|
||||
kr = KR.ocr(image, cls=True)[0] or []
|
||||
ru = RU.ocr(image, cls=True)[0] or []
|
||||
#en = EN.ocr(image, cls=True)[0] or []
|
||||
#ch = CH.ocr(image, cls=True)[0] or []
|
||||
#jp = JP.ocr(image, cls=True)[0] or []
|
||||
#kr = KR.ocr(image, cls=True)[0] or []
|
||||
#ru = RU.ocr(image, cls=True)[0] or []
|
||||
# 处理每个模型
|
||||
ru = process_ocr(RU, image)
|
||||
en = process_ocr(EN, image)
|
||||
ch = process_ocr(CH, image)
|
||||
jp = process_ocr(JP, image)
|
||||
kr = process_ocr(KR, image)
|
||||
|
||||
# 排除字符长度小于2的行, 排除纯数字的行, 排除置信度小于 0.8 的行
|
||||
jp = [x for x in jp if len(x[1][0]) > 1 and not x[1][0].isdigit() and x[1][1] > 0.8]
|
||||
@@ -142,12 +157,17 @@ def process_images(conn, offset=0) -> int:
|
||||
data = json.dumps(data, ensure_ascii=False, cls=MyEncoder)
|
||||
c.execute("UPDATE web_images SET text = %s WHERE id = %s", (data, id))
|
||||
conn.commit()
|
||||
paddle.device.cuda.empty_cache() # 清理缓存
|
||||
gc.collect() # 强制垃圾回收
|
||||
paddle.device.cuda.empty_cache() # 清理缓存
|
||||
gc.collect() # 强制垃圾回收
|
||||
return offset+100
|
||||
|
||||
def main():
    """Process image batches forever, starting at offset 2000.

    Obtains a connection from ``connect_to_mysql()``, then loops without a
    termination condition: each pass prints the current offset, hands it to
    ``process_images`` and uses the returned value as the next offset.
    """
    db = connect_to_mysql()
    next_offset = 2000
    while True:
        print("LOOP:", next_offset)
        next_offset = process_images(db, next_offset)
        # Zero-length sleep kept from the original pacing hook.
        time.sleep(0)
|
||||
|
||||
|
Reference in New Issue
Block a user