重新生成依赖列表

This commit is contained in:
2023-12-02 02:14:17 +08:00
parent 92921f99eb
commit 990f702c9f
3 changed files with 38 additions and 81 deletions

View File

@@ -7,10 +7,12 @@ import pymysql
import cnocr
import json
import numpy as np
import warnings
from PIL import Image, ImageFile
from dotenv import dotenv_values
from elasticsearch import Elasticsearch
# Install a blanket "ignore" filter so no warnings are printed by this process.
warnings.filterwarnings("ignore")
# Let Pillow load image files whose data is truncated instead of raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Settings parsed from the local .env file (e.g. the ELASTICSEARCH_* entries).
config = dotenv_values(".env")
# Set the oss2 library's default connection pool size to 100.
oss2.defaults.connection_pool_size = 100
@@ -45,15 +47,17 @@ def save_text(conn, id:int, text:str):
def process_images(conn, ocr, es):
    """Run OCR over images that have no stored text yet and persist the results.

    Selects up to 10000 rows of ``web_images`` whose ``text`` column is empty.
    For each row the image is fetched with ``download_image``; rows for which
    that returns ``None`` are skipped. Recognised fragments are kept only when
    their text is non-empty, longer than one character and not purely numeric.
    The kept fragments are stored as JSON through ``save_text`` and their
    space-joined text is indexed in the ``web_images`` Elasticsearch index
    under the row id.

    Args:
        conn: Open pymysql connection.
        ocr: OCR engine; ``ocr.ocr(image)`` must yield dicts with a ``'text'`` key.
        es: Elasticsearch client used for indexing.
    """
    with conn.cursor(pymysql.cursors.SSCursor) as cursor:
        cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10000")
        # fetchall() reads the whole unbuffered result up front; presumably this
        # is what lets save_text reuse the same connection inside the loop.
        for image_id, content in cursor.fetchall():
            image = download_image(content)
            if image is None:
                continue
            item = [
                x for x in ocr.ocr(image)
                if x['text'] and not x['text'].isdigit() and len(x['text']) > 1
            ]
            text = ' '.join(x['text'] for x in item)
            print(image_id, text)
            save_text(conn, image_id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
            # `document=` replaces the `body=` keyword deprecated in the 8.x client.
            es.index(index='web_images', id=image_id, document={'content': text})
            # NOTE(review): the original indentation was lost, so the intended
            # placement of this commit is unknown. Committing per image keeps
            # finished rows if a later image fails -- confirm.
            conn.commit()
def main():
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)

24
pp.py Executable file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Smoke test for PaddleOCR: recognise the text in one sample image and print it."""
from paddleocr import PaddleOCR, draw_ocr

# The language PaddleOCR recognises is selected through the `lang` parameter,
# for example `ch`, `en`, `fr`, `german`, `korean`, `japan`.
# Build the engine only once: construction downloads the models and loads
# them into memory.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")
img_path = './imgs/14.jpg'
result = ocr.ocr(img_path, cls=True)

# `result` holds one entry per input image. PaddleOCR 2.6+ reports an image
# with no detected text as None, so skip those instead of iterating over None.
for res in result or []:
    if res is None:
        continue
    for line in res:
        print(line)

# Visualise the result (disabled example; draw_ocr is imported for it)
#from PIL import Image
#result = result[0]
#image = Image.open(img_path).convert('RGB')
#boxes = [line[0] for line in result]
#txts = [line[1][0] for line in result]
#scores = [line[1][1] for line in result]
#im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
#im_show = Image.fromarray(im_show)
#im_show.save('result.jpg')

View File

@@ -1,82 +1,11 @@
aiohttp==3.8.6
aiosignal==1.3.1
appdirs==1.4.4
async-timeout==4.0.3
attrs==23.1.0
certifi==2023.7.22
charset-normalizer==3.3.2
click==8.1.7
cnocr==2.2.4.2
cnstd==1.2.3.5
coloredlogs==15.0.1
contourpy==1.2.0
cycler==0.12.1
docker-pycreds==0.4.0
filelock==3.13.1
flatbuffers==23.5.26
fonttools==4.44.0
frozenlist==1.4.0
fsspec==2023.10.0
gitdb==4.0.11
GitPython==3.1.40
huggingface-hub==0.19.0
humanfriendly==10.0
idna==3.4
Jinja2==3.1.2
kiwisolver==1.4.5
lightning-utilities==0.9.0
MarkupSafe==2.1.3
matplotlib==3.8.1
mpmath==1.3.0
multidict==6.0.4
networkx==3.2.1
numpy==1.26.1
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.18.1
nvidia-nvjitlink-cu12==12.3.52
nvidia-nvtx-cu12==12.1.105
onnx==1.15.0
onnxruntime==1.16.2
opencv-python==4.8.1.78
packaging==23.2
pandas==2.1.3
elasticsearch==8.11.0
numpy==1.26.2
oss2==2.18.3
paddleocr==2.7.0.3
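# paddleocr.egg==info  -- not an installable requirement (likely a generator artifact from a local paddleocr.egg-info directory); pip would fail on it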
Pillow==10.1.0
Pillow==10.1.0
Polygon3==3.0.9.1
protobuf==4.25.0
psutil==5.9.6
pyclipper==1.3.0.post5
PyMySQL==1.1.0
pyparsing==3.1.1
python-dateutil==2.8.2
pytorch-lightning==2.1.1
pytz==2023.3.post1
PyYAML==6.0.1
requests==2.31.0
scipy==1.11.3
seaborn==0.13.0
sentry-sdk==1.34.0
setproctitle==1.3.3
shapely==2.0.2
six==1.16.0
smmap==5.0.1
sympy==1.12
torch==2.1.0+cpu
torchaudio==2.1.0
torchmetrics==1.2.0
torchvision==0.16.0+cpu
tqdm==4.66.1
triton==2.1.0
typing_extensions==4.8.0
tzdata==2023.3
Unidecode==1.3.7
urllib3==2.0.7
wandb==0.16.0
yarl==1.9.2
python-dotenv==1.0.0
Requests==2.31.0