重新生成依赖列表

This commit is contained in:
2023-12-02 02:14:17 +08:00
parent 92921f99eb
commit 990f702c9f
3 changed files with 38 additions and 81 deletions

View File

@@ -7,10 +7,12 @@ import pymysql
import cnocr import cnocr
import json import json
import numpy as np import numpy as np
import warnings
from PIL import Image, ImageFile from PIL import Image, ImageFile
from dotenv import dotenv_values from dotenv import dotenv_values
from elasticsearch import Elasticsearch from elasticsearch import Elasticsearch
warnings.filterwarnings("ignore")
ImageFile.LOAD_TRUNCATED_IMAGES = True ImageFile.LOAD_TRUNCATED_IMAGES = True
config = dotenv_values(".env") config = dotenv_values(".env")
oss2.defaults.connection_pool_size = 100 oss2.defaults.connection_pool_size = 100
@@ -45,15 +47,17 @@ def save_text(conn, id:int, text:str):
def process_images(conn, ocr, es): def process_images(conn, ocr, es):
with conn.cursor(pymysql.cursors.SSCursor) as cursor: with conn.cursor(pymysql.cursors.SSCursor) as cursor:
cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10") cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10000")
for id, content in cursor.fetchall(): for id, content in cursor.fetchall():
image = download_image(content) image = download_image(content)
if image is None: if image is None:
continue continue
item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
text = ' '.join([x['text'] for x in item]) text = ' '.join([x['text'] for x in item])
print(id, text)
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
es.index(index='web_images', id=id, body={'content': text}) es.index(index='web_images', id=id, body={'content': text})
conn.commit()
def main(): def main():
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False) es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)

24
pp.py Executable file
View File

@@ -0,0 +1,24 @@
#!/usr/bin/env python3
"""Run PaddleOCR on one sample image and print every recognized text line."""
from paddleocr import PaddleOCR, draw_ocr

# The language PaddleOCR recognizes is switched through the `lang` parameter,
# for example `ch`, `en`, `fr`, `german`, `korean`, `japan`.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = './imgs/14.jpg'
result = ocr.ocr(img_path, cls=True)

# `result` holds one entry per processed image; iterate it directly instead of
# indexing through range(len(...)).
for res in result:
    # NOTE(review): PaddleOCR 2.7 appears to yield None (not an empty list) for
    # an image in which no text was detected; skip such entries instead of
    # raising TypeError when iterating them.
    if not res:
        continue
    for line in res:
        print(line)

# Optional visualization of the result (kept disabled, as in the upstream example).
#from PIL import Image
#result = result[0]
#image = Image.open(img_path).convert('RGB')
#boxes = [line[0] for line in result]
#txts = [line[1][0] for line in result]
#scores = [line[1][1] for line in result]
#im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
#im_show = Image.fromarray(im_show)
#im_show.save('result.jpg')

View File

@@ -1,82 +1,11 @@
aiohttp==3.8.6
aiosignal==1.3.1
appdirs==1.4.4
async-timeout==4.0.3
attrs==23.1.0
certifi==2023.7.22
charset-normalizer==3.3.2
click==8.1.7
cnocr==2.2.4.2 cnocr==2.2.4.2
cnstd==1.2.3.5 elasticsearch==8.11.0
coloredlogs==15.0.1 numpy==1.26.2
contourpy==1.2.0 oss2==2.18.3
cycler==0.12.1 paddleocr==2.7.0.3
docker-pycreds==0.4.0 paddleocr.egg==info
filelock==3.13.1 Pillow==10.1.0
flatbuffers==23.5.26
fonttools==4.44.0
frozenlist==1.4.0
fsspec==2023.10.0
gitdb==4.0.11
GitPython==3.1.40
huggingface-hub==0.19.0
humanfriendly==10.0
idna==3.4
Jinja2==3.1.2
kiwisolver==1.4.5
lightning-utilities==0.9.0
MarkupSafe==2.1.3
matplotlib==3.8.1
mpmath==1.3.0
multidict==6.0.4
networkx==3.2.1
numpy==1.26.1
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.18.1
nvidia-nvjitlink-cu12==12.3.52
nvidia-nvtx-cu12==12.1.105
onnx==1.15.0
onnxruntime==1.16.2
opencv-python==4.8.1.78
packaging==23.2
pandas==2.1.3
Pillow==10.1.0 Pillow==10.1.0
Polygon3==3.0.9.1
protobuf==4.25.0
psutil==5.9.6
pyclipper==1.3.0.post5
PyMySQL==1.1.0 PyMySQL==1.1.0
pyparsing==3.1.1 python-dotenv==1.0.0
python-dateutil==2.8.2 Requests==2.31.0
pytorch-lightning==2.1.1
pytz==2023.3.post1
PyYAML==6.0.1
requests==2.31.0
scipy==1.11.3
seaborn==0.13.0
sentry-sdk==1.34.0
setproctitle==1.3.3
shapely==2.0.2
six==1.16.0
smmap==5.0.1
sympy==1.12
torch==2.1.0+cpu
torchaudio==2.1.0
torchmetrics==1.2.0
torchvision==0.16.0+cpu
tqdm==4.66.1
triton==2.1.0
typing_extensions==4.8.0
tzdata==2023.3
Unidecode==1.3.7
urllib3==2.0.7
wandb==0.16.0
yarl==1.9.2