From 655fc8c1c0aec71ac070dc90350105def923264e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=A7=89?=
Date: Sat, 2 Dec 2023 02:41:36 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A4=9A=E8=AF=AD=E8=A8=80=E8=B0=83=E8=AF=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (commentary only; not part of the commit).

Subject decodes to "multi-language debugging".

What the patch does:
  * .gitignore: adds `data` to the ignored paths.
  * pp.py: comments out the single-image PaddleOCR demo and adds a script
    that selects up to 10 rows from `web_images` where text='', downloads
    each image (through OSS for image.gameuiux.cn URLs, otherwise with a
    plain HTTP GET) and prints the OCR output of the ch / japan / korean
    models. The save_text, Elasticsearch index and conn.commit() calls are
    commented out, so nothing is written back.

Observations on the added code:
  * requests.get(url) is called without a timeout.
  * download_image catches every Exception and only prints a message.
  * MyEncoder serialises np.float32 through int(), which drops the
    fractional part.
  * main() never closes the MySQL connection it opens.
  * NOTE(review): a PIL Image is passed straight to .ocr(); presumably it
    needs to be converted (e.g. np.array(image)) first - verify against the
    PaddleOCR documentation.

Formatting caveat: line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. Line counts agree with the @@
headers and the diffstat, but indentation width, spacing inside lines
(e.g. before the trailing comment on the removed `ocr = ...` line) and the
position of the unchanged blank line in the first pp.py hunk are best
guesses - TODO confirm against the original commit.

 .gitignore |  1 +
 pp.py      | 93 ++++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0616307..c9a743c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,6 +121,7 @@ celerybeat.pid
 *.sage.py
 
 # Environments
+data
 .env
 .venv
 env/
diff --git a/pp.py b/pp.py
index 67caaae..1b334e8 100755
--- a/pp.py
+++ b/pp.py
@@ -1,15 +1,91 @@
 #!/usr/bin/env python3
-from paddleocr import PaddleOCR, draw_ocr
+
+import io
+import requests
+import oss2
+import pymysql
+import json
+import numpy as np
+import warnings
+from PIL import Image, ImageFile
+from dotenv import dotenv_values
+from elasticsearch import Elasticsearch
+from paddleocr import PaddleOCR
+
+warnings.filterwarnings("ignore")
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+config = dotenv_values(".env")
+oss2.defaults.connection_pool_size = 100
+
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.float32):
+            return int(obj)
+        if isinstance(obj, np.ndarray):
+            return obj.astype(int).tolist()
+        return super(MyEncoder, self).default(obj)
+
+def download_image(url:str) -> Image.Image:
+    try:
+        if url.startswith('http://image.gameuiux.cn/') or url.startswith('https://image.gameuiux.cn/'):
+            url = url.replace('http://image.gameuiux.cn/', '').replace('https://image.gameuiux.cn/', '')
+            oss_auth = oss2.Auth(config['OSS_ACCESS_KEY_ID'], config['OSS_ACCESS_KEY_SECRET'])
+            return Image.open(io.BytesIO(oss2.Bucket(oss_auth, f'http://{config["OSS_HOST"]}', config['OSS_BUCKET_NAME']).get_object(url).read()))
+        else:
+            response = requests.get(url)
+            return Image.open(io.BytesIO(response.content))
+    except Exception:
+        print(f'图片从{url}下载失败')
+        return None
+
+def connect_to_mysql():
+    return pymysql.connect(host=config['MYSQL_HOST'], user=config['MYSQL_USER'], password=config['MYSQL_PASSWORD'], database=config['MYSQL_NAME'], cursorclass=pymysql.cursors.SSDictCursor)
+
+def save_text(conn, id:int, text:str):
+    with conn.cursor() as cursor:
+        cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id))
+
+CH = PaddleOCR(use_angle_cls=True, lang="ch")
+JP = PaddleOCR(use_angle_cls=True, lang="japan")
+KR = PaddleOCR(use_angle_cls=True, lang="korean")
+
+def process_images(conn, es):
+    with conn.cursor(pymysql.cursors.SSCursor) as cursor:
+        cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10")
+        for id, content in cursor.fetchall():
+            image = download_image(content)
+            if image is None:
+                continue
+            print(CH.ocr(image, cls=True))
+            print(JP.ocr(image, cls=True))
+            print(KR.ocr(image, cls=True))
+            # item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
+            # text = ' '.join([x['text'] for x in item])
+            # print(id, text)
+            # save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
+            # es.index(index='web_images', id=id, body={'content': text})
+    #conn.commit()
+
+def main():
+    es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
+    if not es.indices.exists(index='web_images'):
+        es.indices.create(index='web_images')
+    conn = connect_to_mysql()
+    process_images(conn, es)
+
+if __name__ == "__main__":
+    main()
+
 
 # Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换
 # 例如`ch`, `en`, `fr`, `german`, `korean`, `japan`
-ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
-img_path = './imgs/14.jpg'
-result = ocr.ocr(img_path, cls=True)
-for idx in range(len(result)):
-    res = result[idx]
-    for line in res:
-        print(line)
+#ocr = PaddleOCR(use_angle_cls=True, lang="ch")
+#img_path = './imgs/14.jpg'
+#result = ocr.ocr(img_path, cls=True)
+#for idx in range(len(result)):
+#    res = result[idx]
+#    for line in res:
+#        print(line)
 
 # 显示结果
 #from PIL import Image
@@ -21,4 +97,3 @@ for idx in range(len(result)):
 #im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
 #im_show = Image.fromarray(im_show)
 #im_show.save('result.jpg')
-