重新生成依赖列表
This commit is contained in:
8
main.py
8
main.py
@@ -7,10 +7,12 @@ import pymysql
|
||||
import cnocr
|
||||
import json
|
||||
import numpy as np
|
||||
import warnings
|
||||
from PIL import Image, ImageFile
|
||||
from dotenv import dotenv_values
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||
config = dotenv_values(".env")
|
||||
oss2.defaults.connection_pool_size = 100
|
||||
@@ -45,15 +47,17 @@ def save_text(conn, id:int, text:str):
|
||||
|
||||
def process_images(conn, ocr, es):
    """OCR pending rows of ``web_images`` and store/index the recognized text.

    Selects up to 10000 rows whose ``text`` column is empty, downloads each
    image, runs OCR on it, saves the filtered OCR items as JSON through
    ``save_text`` and indexes the joined text into the ``web_images``
    Elasticsearch index under the row id.

    Args:
        conn: Database connection; must provide ``cursor()`` and ``commit()``.
        ocr: OCR engine; ``ocr.ocr(image)`` must yield dicts with a ``'text'`` key.
        es: Elasticsearch client; ``es.index(...)`` is called once per image.
    """
    with conn.cursor(pymysql.cursors.SSCursor) as cursor:
        cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10000")
        # fetchall() drains the result set up front so the connection can be
        # reused for the per-row writes issued through save_text below.
        for image_id, content in cursor.fetchall():
            image = download_image(content)
            if image is None:
                # Nothing usable came back for this row; skip it.
                continue
            # Keep only non-empty, non-numeric fragments longer than one character.
            item = [
                x for x in ocr.ocr(image)
                if x['text'] and not x['text'].isdigit() and len(x['text']) > 1
            ]
            text = ' '.join(x['text'] for x in item)
            print(image_id, text)
            save_text(conn, image_id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
            es.index(index='web_images', id=image_id, body={'content': text})
    # NOTE(review): the original indentation was lost; confirm whether the
    # commit was meant to run once per batch (as here) or once per row.
    conn.commit()
|
||||
|
||||
def main():
|
||||
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
|
||||
|
24
pp.py
Executable file
24
pp.py
Executable file
@@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
from paddleocr import PaddleOCR, draw_ocr
|
||||
|
||||
# The language PaddleOCR recognizes can be switched through the `lang`
# parameter, for example `ch`, `en`, `fr`, `german`, `korean`, `japan`.
# The engine only needs to be built once: doing so downloads the models
# (first run) and loads them into memory.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")
img_path = './imgs/14.jpg'
result = ocr.ocr(img_path, cls=True)
# `result` holds one entry per input image; an entry may be None (or empty)
# when no text was detected, so skip those instead of iterating over them.
for res in result:
    if not res:
        continue
    for line in res:
        print(line)
|
||||
|
||||
# 显示结果
|
||||
#from PIL import Image
|
||||
#result = result[0]
|
||||
#image = Image.open(img_path).convert('RGB')
|
||||
#boxes = [line[0] for line in result]
|
||||
#txts = [line[1][0] for line in result]
|
||||
#scores = [line[1][1] for line in result]
|
||||
#im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
|
||||
#im_show = Image.fromarray(im_show)
|
||||
#im_show.save('result.jpg')
|
||||
|
@@ -1,82 +1,11 @@
|
||||
aiohttp==3.8.6
|
||||
aiosignal==1.3.1
|
||||
appdirs==1.4.4
|
||||
async-timeout==4.0.3
|
||||
attrs==23.1.0
|
||||
certifi==2023.7.22
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
cnocr==2.2.4.2
|
||||
cnstd==1.2.3.5
|
||||
coloredlogs==15.0.1
|
||||
contourpy==1.2.0
|
||||
cycler==0.12.1
|
||||
docker-pycreds==0.4.0
|
||||
filelock==3.13.1
|
||||
flatbuffers==23.5.26
|
||||
fonttools==4.44.0
|
||||
frozenlist==1.4.0
|
||||
fsspec==2023.10.0
|
||||
gitdb==4.0.11
|
||||
GitPython==3.1.40
|
||||
huggingface-hub==0.19.0
|
||||
humanfriendly==10.0
|
||||
idna==3.4
|
||||
Jinja2==3.1.2
|
||||
kiwisolver==1.4.5
|
||||
lightning-utilities==0.9.0
|
||||
MarkupSafe==2.1.3
|
||||
matplotlib==3.8.1
|
||||
mpmath==1.3.0
|
||||
multidict==6.0.4
|
||||
networkx==3.2.1
|
||||
numpy==1.26.1
|
||||
nvidia-cublas-cu12==12.1.3.1
|
||||
nvidia-cuda-cupti-cu12==12.1.105
|
||||
nvidia-cuda-nvrtc-cu12==12.1.105
|
||||
nvidia-cuda-runtime-cu12==12.1.105
|
||||
nvidia-cudnn-cu12==8.9.2.26
|
||||
nvidia-cufft-cu12==11.0.2.54
|
||||
nvidia-curand-cu12==10.3.2.106
|
||||
nvidia-cusolver-cu12==11.4.5.107
|
||||
nvidia-cusparse-cu12==12.1.0.106
|
||||
nvidia-nccl-cu12==2.18.1
|
||||
nvidia-nvjitlink-cu12==12.3.52
|
||||
nvidia-nvtx-cu12==12.1.105
|
||||
onnx==1.15.0
|
||||
onnxruntime==1.16.2
|
||||
opencv-python==4.8.1.78
|
||||
packaging==23.2
|
||||
pandas==2.1.3
|
||||
elasticsearch==8.11.0
|
||||
numpy==1.26.2
|
||||
oss2==2.18.3
|
||||
paddleocr==2.7.0.3
|
||||
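# paddleocr.egg==info  -- not an installable requirement: this looks like a generator artifact from a local `paddleocr.egg-info` directory ("info" is not a valid version)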
|
||||
Pillow==10.1.0
|
||||
Pillow==10.1.0
|
||||
Polygon3==3.0.9.1
|
||||
protobuf==4.25.0
|
||||
psutil==5.9.6
|
||||
pyclipper==1.3.0.post5
|
||||
PyMySQL==1.1.0
|
||||
pyparsing==3.1.1
|
||||
python-dateutil==2.8.2
|
||||
pytorch-lightning==2.1.1
|
||||
pytz==2023.3.post1
|
||||
PyYAML==6.0.1
|
||||
requests==2.31.0
|
||||
scipy==1.11.3
|
||||
seaborn==0.13.0
|
||||
sentry-sdk==1.34.0
|
||||
setproctitle==1.3.3
|
||||
shapely==2.0.2
|
||||
six==1.16.0
|
||||
smmap==5.0.1
|
||||
sympy==1.12
|
||||
torch==2.1.0+cpu
|
||||
torchaudio==2.1.0
|
||||
torchmetrics==1.2.0
|
||||
torchvision==0.16.0+cpu
|
||||
tqdm==4.66.1
|
||||
triton==2.1.0
|
||||
typing_extensions==4.8.0
|
||||
tzdata==2023.3
|
||||
Unidecode==1.3.7
|
||||
urllib3==2.0.7
|
||||
wandb==0.16.0
|
||||
yarl==1.9.2
|
||||
python-dotenv==1.0.0
|
||||
Requests==2.31.0
|
||||
|
Reference in New Issue
Block a user