From 990f702c9fb6956593589dd1918cfe387f1013f4 Mon Sep 17 00:00:00 2001 From: satori Date: Sat, 2 Dec 2023 02:14:17 +0800 Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E6=96=B0=E7=94=9F=E6=88=90=E4=BE=9D?= =?UTF-8?q?=E8=B5=96=E5=88=97=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 8 +++-- pp.py | 24 +++++++++++++ requirements.txt | 87 +++++------------------------------------------- 3 files changed, 38 insertions(+), 81 deletions(-) create mode 100755 pp.py diff --git a/main.py b/main.py index 5804cd3..48ad249 100755 --- a/main.py +++ b/main.py @@ -7,10 +7,12 @@ import pymysql import cnocr import json import numpy as np +import warnings from PIL import Image, ImageFile from dotenv import dotenv_values from elasticsearch import Elasticsearch +warnings.filterwarnings("ignore") ImageFile.LOAD_TRUNCATED_IMAGES = True config = dotenv_values(".env") oss2.defaults.connection_pool_size = 100 @@ -45,15 +47,17 @@ def save_text(conn, id:int, text:str): def process_images(conn, ocr, es): with conn.cursor(pymysql.cursors.SSCursor) as cursor: - cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10") + cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10000") for id, content in cursor.fetchall(): image = download_image(content) if image is None: continue item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] - save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder)) text = ' '.join([x['text'] for x in item]) + print(id, text) + save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder)) es.index(index='web_images', id=id, body={'content': text}) + conn.commit() def main(): es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False) diff --git a/pp.py b/pp.py new file mode 100755 index 0000000..67caaae --- /dev/null +++ b/pp.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +from paddleocr import PaddleOCR, draw_ocr + +# Paddleocr目前支持的多语言语种可以通过修改lang参数进行切换 +# 例如`ch`, `en`, `fr`, `german`, `korean`, `japan` +ocr = PaddleOCR(use_angle_cls=True, lang="ch") # need to run only once to download and load model into memory +img_path = './imgs/14.jpg' +result = ocr.ocr(img_path, cls=True) +for idx in range(len(result)): + res = result[idx] + for line in res: + print(line) + +# 显示结果 +#from PIL import Image +#result = result[0] +#image = Image.open(img_path).convert('RGB') +#boxes = [line[0] for line in result] +#txts = [line[1][0] for line in result] +#scores = [line[1][1] for line in result] +#im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf') +#im_show = Image.fromarray(im_show) +#im_show.save('result.jpg') + diff --git a/requirements.txt b/requirements.txt index dbdaa71..3b90f65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,82 +1,11 @@ -aiohttp==3.8.6 -aiosignal==1.3.1 -appdirs==1.4.4 -async-timeout==4.0.3 -attrs==23.1.0 -certifi==2023.7.22 -charset-normalizer==3.3.2 -click==8.1.7 cnocr==2.2.4.2 -cnstd==1.2.3.5 -coloredlogs==15.0.1 -contourpy==1.2.0 -cycler==0.12.1 -docker-pycreds==0.4.0 -filelock==3.13.1 -flatbuffers==23.5.26 -fonttools==4.44.0 -frozenlist==1.4.0 -fsspec==2023.10.0 -gitdb==4.0.11 -GitPython==3.1.40 -huggingface-hub==0.19.0 -humanfriendly==10.0 -idna==3.4 -Jinja2==3.1.2 -kiwisolver==1.4.5 -lightning-utilities==0.9.0 -MarkupSafe==2.1.3 -matplotlib==3.8.1 -mpmath==1.3.0 -multidict==6.0.4 -networkx==3.2.1 -numpy==1.26.1 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-nccl-cu12==2.18.1 -nvidia-nvjitlink-cu12==12.3.52 -nvidia-nvtx-cu12==12.1.105 -onnx==1.15.0 -onnxruntime==1.16.2 -opencv-python==4.8.1.78 -packaging==23.2 -pandas==2.1.3 +elasticsearch==8.11.0 +numpy==1.26.2 +oss2==2.18.3 +paddleocr==2.7.0.3 +paddleocr.egg==info +Pillow==10.1.0 Pillow==10.1.0 -Polygon3==3.0.9.1 -protobuf==4.25.0 -psutil==5.9.6 -pyclipper==1.3.0.post5 PyMySQL==1.1.0 -pyparsing==3.1.1 -python-dateutil==2.8.2 -pytorch-lightning==2.1.1 -pytz==2023.3.post1 -PyYAML==6.0.1 -requests==2.31.0 -scipy==1.11.3 -seaborn==0.13.0 -sentry-sdk==1.34.0 -setproctitle==1.3.3 -shapely==2.0.2 -six==1.16.0 -smmap==5.0.1 -sympy==1.12 -torch==2.1.0+cpu -torchaudio==2.1.0 -torchmetrics==1.2.0 -torchvision==0.16.0+cpu -tqdm==4.66.1 -triton==2.1.0 -typing_extensions==4.8.0 -tzdata==2023.3 -Unidecode==1.3.7 -urllib3==2.0.7 -wandb==0.16.0 -yarl==1.9.2 +python-dotenv==1.0.0 +Requests==2.31.0