diff --git a/main.py b/main.py index 48ad249..c524ec6 100755 --- a/main.py +++ b/main.py @@ -55,9 +55,9 @@ def process_images(conn, ocr, es): item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] text = ' '.join([x['text'] for x in item]) print(id, text) - save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder)) - es.index(index='web_images', id=id, body={'content': text}) - conn.commit() + #save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder)) + #es.index(index='web_images', id=id, body={'content': text}) + #conn.commit() def main(): es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False) diff --git a/pp.py b/pp.py index 1b334e8..ecfd4f1 100755 --- a/pp.py +++ b/pp.py @@ -7,11 +7,14 @@ import pymysql import json import numpy as np import warnings +import logging from PIL import Image, ImageFile from dotenv import dotenv_values from elasticsearch import Elasticsearch from paddleocr import PaddleOCR +logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印 +logging.disable(logging.WARNING) # 关闭WARNING日志的打印 warnings.filterwarnings("ignore") ImageFile.LOAD_TRUNCATED_IMAGES = True config = dotenv_values(".env") @@ -34,8 +37,8 @@ def download_image(url:str) -> Image.Image: else: response = requests.get(url) return Image.open(io.BytesIO(response.content)) - except Exception: - print(f'图片从{url}下载失败') + except Exception as e: + print(f'图片从{url}下载失败,错误信息为:{e}') return None def connect_to_mysql(): @@ -51,11 +54,14 @@ KR = PaddleOCR(use_angle_cls=True, lang="korean") def process_images(conn, es): with conn.cursor(pymysql.cursors.SSCursor) as cursor: - cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10") + cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10") for id, content in cursor.fetchall(): image = download_image(content) if image is None: continue + if isinstance(image, Image.Image): + image = np.array(image) + print('---------------------', id, content) print(CH.ocr(image, cls=True)) print(JP.ocr(image, cls=True)) print(KR.ocr(image, cls=True))