This commit is contained in:
2023-12-02 03:50:05 +08:00
parent 655fc8c1c0
commit 0dbd957454
2 changed files with 12 additions and 6 deletions

View File

@@ -55,9 +55,9 @@ def process_images(conn, ocr, es):
item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
text = ' '.join([x['text'] for x in item]) text = ' '.join([x['text'] for x in item])
print(id, text) print(id, text)
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder)) #save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
es.index(index='web_images', id=id, body={'content': text}) #es.index(index='web_images', id=id, body={'content': text})
conn.commit() #conn.commit()
def main(): def main():
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False) es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)

12
pp.py
View File

@@ -7,11 +7,14 @@ import pymysql
import json import json
import numpy as np import numpy as np
import warnings import warnings
import logging
from PIL import Image, ImageFile from PIL import Image, ImageFile
from dotenv import dotenv_values from dotenv import dotenv_values
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印
logging.disable(logging.WARNING) # 关闭WARNING日志的打印
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
ImageFile.LOAD_TRUNCATED_IMAGES = True ImageFile.LOAD_TRUNCATED_IMAGES = True
config = dotenv_values(".env") config = dotenv_values(".env")
@@ -34,8 +37,8 @@ def download_image(url:str) -> Image.Image:
else: else:
response = requests.get(url) response = requests.get(url)
return Image.open(io.BytesIO(response.content)) return Image.open(io.BytesIO(response.content))
except Exception: except Exception as e:
print(f'图片从{url}下载失败') print(f'图片从{url}下载失败,错误信息为:{e}')
return None return None
def connect_to_mysql(): def connect_to_mysql():
@@ -51,11 +54,14 @@ KR = PaddleOCR(use_angle_cls=True, lang="korean")
def process_images(conn, es): def process_images(conn, es):
with conn.cursor(pymysql.cursors.SSCursor) as cursor: with conn.cursor(pymysql.cursors.SSCursor) as cursor:
cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10") cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10")
for id, content in cursor.fetchall(): for id, content in cursor.fetchall():
image = download_image(content) image = download_image(content)
if image is None: if image is None:
continue continue
if isinstance(image, Image.Image):
image = np.array(image)
print('---------------------', id, content)
print(CH.ocr(image, cls=True)) print(CH.ocr(image, cls=True))
print(JP.ocr(image, cls=True)) print(JP.ocr(image, cls=True))
print(KR.ocr(image, cls=True)) print(KR.ocr(image, cls=True))