重新生成依赖列表
This commit is contained in:
8
main.py
8
main.py
@@ -7,10 +7,12 @@ import pymysql
|
|||||||
import cnocr
|
import cnocr
|
||||||
import json
|
import json
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import warnings
|
||||||
from PIL import Image, ImageFile
|
from PIL import Image, ImageFile
|
||||||
from dotenv import dotenv_values
|
from dotenv import dotenv_values
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
|
|
||||||
|
warnings.filterwarnings("ignore")
|
||||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||||
config = dotenv_values(".env")
|
config = dotenv_values(".env")
|
||||||
oss2.defaults.connection_pool_size = 100
|
oss2.defaults.connection_pool_size = 100
|
||||||
@@ -45,15 +47,17 @@ def save_text(conn, id:int, text:str):
|
|||||||
|
|
||||||
def process_images(conn, ocr, es):
|
def process_images(conn, ocr, es):
|
||||||
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
||||||
cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10")
|
cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10000")
|
||||||
for id, content in cursor.fetchall():
|
for id, content in cursor.fetchall():
|
||||||
image = download_image(content)
|
image = download_image(content)
|
||||||
if image is None:
|
if image is None:
|
||||||
continue
|
continue
|
||||||
item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
|
item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
|
||||||
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
|
|
||||||
text = ' '.join([x['text'] for x in item])
|
text = ' '.join([x['text'] for x in item])
|
||||||
|
print(id, text)
|
||||||
|
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
|
||||||
es.index(index='web_images', id=id, body={'content': text})
|
es.index(index='web_images', id=id, body={'content': text})
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
|
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
|
||||||
|
24
pp.py
Executable file
24
pp.py
Executable file
@@ -0,0 +1,24 @@
|
|||||||
|
#!/usr/bin/env python3
"""Smoke-test script: run PaddleOCR on one sample image and print each result line."""
from paddleocr import PaddleOCR, draw_ocr

# PaddleOCR selects the recognition language through the `lang` parameter,
# for example `ch`, `en`, `fr`, `german`, `korean`, `japan`.
ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # need to run only once to download and load model into memory
img_path = './imgs/14.jpg'
result = ocr.ocr(img_path, cls=True)

# `result` holds one entry per input image. An entry can be None when no text
# was detected in that image, so skip it instead of iterating over None.
for res in result or []:
    if res is None:
        continue
    for line in res:
        print(line)

# Visualize the result (kept disabled, as in the original script)
#from PIL import Image
#result = result[0]
#image = Image.open(img_path).convert('RGB')
#boxes = [line[0] for line in result]
#txts = [line[1][0] for line in result]
#scores = [line[1][1] for line in result]
#im_show = draw_ocr(image, boxes, txts, scores, font_path='./fonts/simfang.ttf')
#im_show = Image.fromarray(im_show)
#im_show.save('result.jpg')
|
||||||
|
|
@@ -1,82 +1,11 @@
|
|||||||
aiohttp==3.8.6
|
|
||||||
aiosignal==1.3.1
|
|
||||||
appdirs==1.4.4
|
|
||||||
async-timeout==4.0.3
|
|
||||||
attrs==23.1.0
|
|
||||||
certifi==2023.7.22
|
|
||||||
charset-normalizer==3.3.2
|
|
||||||
click==8.1.7
|
|
||||||
cnocr==2.2.4.2
|
cnocr==2.2.4.2
|
||||||
cnstd==1.2.3.5
|
elasticsearch==8.11.0
|
||||||
coloredlogs==15.0.1
|
numpy==1.26.2
|
||||||
contourpy==1.2.0
|
oss2==2.18.3
|
||||||
cycler==0.12.1
|
paddleocr==2.7.0.3
|
||||||
docker-pycreds==0.4.0
|
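# paddleocr.egg==info  (not an installable requirement: "info" is not a valid version; presumably a stray entry picked up from a local paddleocr.egg-info directory when the list was regenerated)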
|
||||||
filelock==3.13.1
|
Pillow==10.1.0
|
||||||
flatbuffers==23.5.26
|
|
||||||
fonttools==4.44.0
|
|
||||||
frozenlist==1.4.0
|
|
||||||
fsspec==2023.10.0
|
|
||||||
gitdb==4.0.11
|
|
||||||
GitPython==3.1.40
|
|
||||||
huggingface-hub==0.19.0
|
|
||||||
humanfriendly==10.0
|
|
||||||
idna==3.4
|
|
||||||
Jinja2==3.1.2
|
|
||||||
kiwisolver==1.4.5
|
|
||||||
lightning-utilities==0.9.0
|
|
||||||
MarkupSafe==2.1.3
|
|
||||||
matplotlib==3.8.1
|
|
||||||
mpmath==1.3.0
|
|
||||||
multidict==6.0.4
|
|
||||||
networkx==3.2.1
|
|
||||||
numpy==1.26.1
|
|
||||||
nvidia-cublas-cu12==12.1.3.1
|
|
||||||
nvidia-cuda-cupti-cu12==12.1.105
|
|
||||||
nvidia-cuda-nvrtc-cu12==12.1.105
|
|
||||||
nvidia-cuda-runtime-cu12==12.1.105
|
|
||||||
nvidia-cudnn-cu12==8.9.2.26
|
|
||||||
nvidia-cufft-cu12==11.0.2.54
|
|
||||||
nvidia-curand-cu12==10.3.2.106
|
|
||||||
nvidia-cusolver-cu12==11.4.5.107
|
|
||||||
nvidia-cusparse-cu12==12.1.0.106
|
|
||||||
nvidia-nccl-cu12==2.18.1
|
|
||||||
nvidia-nvjitlink-cu12==12.3.52
|
|
||||||
nvidia-nvtx-cu12==12.1.105
|
|
||||||
onnx==1.15.0
|
|
||||||
onnxruntime==1.16.2
|
|
||||||
opencv-python==4.8.1.78
|
|
||||||
packaging==23.2
|
|
||||||
pandas==2.1.3
|
|
||||||
Pillow==10.1.0
|
Pillow==10.1.0
|
||||||
Polygon3==3.0.9.1
|
|
||||||
protobuf==4.25.0
|
|
||||||
psutil==5.9.6
|
|
||||||
pyclipper==1.3.0.post5
|
|
||||||
PyMySQL==1.1.0
|
PyMySQL==1.1.0
|
||||||
pyparsing==3.1.1
|
python-dotenv==1.0.0
|
||||||
python-dateutil==2.8.2
|
Requests==2.31.0
|
||||||
pytorch-lightning==2.1.1
|
|
||||||
pytz==2023.3.post1
|
|
||||||
PyYAML==6.0.1
|
|
||||||
requests==2.31.0
|
|
||||||
scipy==1.11.3
|
|
||||||
seaborn==0.13.0
|
|
||||||
sentry-sdk==1.34.0
|
|
||||||
setproctitle==1.3.3
|
|
||||||
shapely==2.0.2
|
|
||||||
six==1.16.0
|
|
||||||
smmap==5.0.1
|
|
||||||
sympy==1.12
|
|
||||||
torch==2.1.0+cpu
|
|
||||||
torchaudio==2.1.0
|
|
||||||
torchmetrics==1.2.0
|
|
||||||
torchvision==0.16.0+cpu
|
|
||||||
tqdm==4.66.1
|
|
||||||
triton==2.1.0
|
|
||||||
typing_extensions==4.8.0
|
|
||||||
tzdata==2023.3
|
|
||||||
Unidecode==1.3.7
|
|
||||||
urllib3==2.0.7
|
|
||||||
wandb==0.16.0
|
|
||||||
yarl==1.9.2
|
|
||||||
|
Reference in New Issue
Block a user