DEBUG
This commit is contained in:
32
pp.py
32
pp.py
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python3.10
|
#!/usr/bin/env python3.10
|
||||||
|
|
||||||
|
import gc
|
||||||
import os
|
import os
|
||||||
import io
|
import io
|
||||||
import oss2
|
import oss2
|
||||||
@@ -17,6 +18,7 @@ import paddle
|
|||||||
from PIL import Image, ImageFile
|
from PIL import Image, ImageFile
|
||||||
from paddleocr import PaddleOCR
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
|
paddle.set_flags({'FLAGS_fraction_of_gpu_memory_to_use': 0.6}) # limit GPU memory usage to 60% of the device (value is 0.6, not 80%)
|
||||||
logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印
|
logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印
|
||||||
logging.disable(logging.WARNING) # 关闭WARNING日志的打印
|
logging.disable(logging.WARNING) # 关闭WARNING日志的打印
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
@@ -84,6 +86,13 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
|
|||||||
KR = PaddleOCR(use_angle_cls=True, lang="korean")
|
KR = PaddleOCR(use_angle_cls=True, lang="korean")
|
||||||
RU = PaddleOCR(use_angle_cls=True, lang="ru")
|
RU = PaddleOCR(use_angle_cls=True, lang="ru")
|
||||||
|
|
||||||
|
# 运行OCR并清理内存
|
||||||
|
def process_ocr(model, image):
    """Run one OCR model on an image, then release cached memory.

    Calls ``model.ocr(image, cls=True)`` and keeps the first element of
    the returned value, substituting an empty list when that element is
    falsy. Afterwards the Paddle CUDA cache is emptied and a garbage
    collection pass is forced before the result is returned.

    Args:
        model: Object exposing an ``ocr(image, cls=True)`` method.
        image: The image passed through unchanged to ``model.ocr``.

    Returns:
        The first element of the OCR output, or ``[]`` if it is falsy.
    """
    detections = model.ocr(image, cls=True)[0]
    if not detections:
        detections = []

    # Release cached GPU memory and force a GC pass after every model run.
    paddle.device.cuda.empty_cache()
    gc.collect()
    return detections
|
||||||
|
|
||||||
def process_images(conn, offset=0) -> int:
|
def process_images(conn, offset=0) -> int:
|
||||||
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
||||||
cursor.execute("SELECT id, content FROM web_images WHERE text='' AND article_category_top_id=22 LIMIT 100 OFFSET %s", (offset,))
|
cursor.execute("SELECT id, content FROM web_images WHERE text='' AND article_category_top_id=22 LIMIT 100 OFFSET %s", (offset,))
|
||||||
@@ -93,14 +102,20 @@ def process_images(conn, offset=0) -> int:
|
|||||||
continue
|
continue
|
||||||
if isinstance(image, Image.Image):
|
if isinstance(image, Image.Image):
|
||||||
image = np.array(image)
|
image = np.array(image)
|
||||||
print('---------------------', id, content)
|
print(id, content)
|
||||||
|
|
||||||
# 執行提取文字
|
# 執行提取文字
|
||||||
en = EN.ocr(image, cls=True)[0] or []
|
#en = EN.ocr(image, cls=True)[0] or []
|
||||||
ch = CH.ocr(image, cls=True)[0] or []
|
#ch = CH.ocr(image, cls=True)[0] or []
|
||||||
jp = JP.ocr(image, cls=True)[0] or []
|
#jp = JP.ocr(image, cls=True)[0] or []
|
||||||
kr = KR.ocr(image, cls=True)[0] or []
|
#kr = KR.ocr(image, cls=True)[0] or []
|
||||||
ru = RU.ocr(image, cls=True)[0] or []
|
#ru = RU.ocr(image, cls=True)[0] or []
|
||||||
|
# 处理每个模型
|
||||||
|
ru = process_ocr(RU, image)
|
||||||
|
en = process_ocr(EN, image)
|
||||||
|
ch = process_ocr(CH, image)
|
||||||
|
jp = process_ocr(JP, image)
|
||||||
|
kr = process_ocr(KR, image)
|
||||||
|
|
||||||
# 排除字符长度小于2的行, 排除纯数字的行, 排除置信度小于 0.8 的行
|
# 排除字符长度小于2的行, 排除纯数字的行, 排除置信度小于 0.8 的行
|
||||||
jp = [x for x in jp if len(x[1][0]) > 1 and not x[1][0].isdigit() and x[1][1] > 0.8]
|
jp = [x for x in jp if len(x[1][0]) > 1 and not x[1][0].isdigit() and x[1][1] > 0.8]
|
||||||
@@ -142,12 +157,17 @@ def process_images(conn, offset=0) -> int:
|
|||||||
data = json.dumps(data, ensure_ascii=False, cls=MyEncoder)
|
data = json.dumps(data, ensure_ascii=False, cls=MyEncoder)
|
||||||
c.execute("UPDATE web_images SET text = %s WHERE id = %s", (data, id))
|
c.execute("UPDATE web_images SET text = %s WHERE id = %s", (data, id))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
paddle.device.cuda.empty_cache() # 清理缓存
|
||||||
|
gc.collect() # 强制垃圾回收
|
||||||
|
paddle.device.cuda.empty_cache() # 清理缓存
|
||||||
|
gc.collect() # 强制垃圾回收
|
||||||
return offset+100
|
return offset+100
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
conn = connect_to_mysql()
|
conn = connect_to_mysql()
|
||||||
offset = 2000
|
offset = 2000
|
||||||
while True:
|
while True:
|
||||||
|
print("LOOP:", offset)
|
||||||
offset = process_images(conn, offset)
|
offset = process_images(conn, offset)
|
||||||
time.sleep(0)
|
time.sleep(0)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user