同步
This commit is contained in:
6
main.py
6
main.py
@@ -55,9 +55,9 @@ def process_images(conn, ocr, es):
|
|||||||
item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
|
item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
|
||||||
text = ' '.join([x['text'] for x in item])
|
text = ' '.join([x['text'] for x in item])
|
||||||
print(id, text)
|
print(id, text)
|
||||||
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
|
#save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
|
||||||
es.index(index='web_images', id=id, body={'content': text})
|
#es.index(index='web_images', id=id, body={'content': text})
|
||||||
conn.commit()
|
#conn.commit()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
|
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
|
||||||
|
12
pp.py
12
pp.py
@@ -7,11 +7,14 @@ import pymysql
|
|||||||
import json
|
import json
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import warnings
|
import warnings
|
||||||
|
import logging
|
||||||
from PIL import Image, ImageFile
|
from PIL import Image, ImageFile
|
||||||
from dotenv import dotenv_values
|
from dotenv import dotenv_values
|
||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
from paddleocr import PaddleOCR
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
|
logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印
|
||||||
|
logging.disable(logging.WARNING) # 关闭WARNING日志的打印
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||||
config = dotenv_values(".env")
|
config = dotenv_values(".env")
|
||||||
@@ -34,8 +37,8 @@ def download_image(url:str) -> Image.Image:
|
|||||||
else:
|
else:
|
||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
return Image.open(io.BytesIO(response.content))
|
return Image.open(io.BytesIO(response.content))
|
||||||
except Exception:
|
except Exception as e:
|
||||||
print(f'图片从{url}下载失败')
|
print(f'图片从{url}下载失败,错误信息为:{e}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def connect_to_mysql():
|
def connect_to_mysql():
|
||||||
@@ -51,11 +54,14 @@ KR = PaddleOCR(use_angle_cls=True, lang="korean")
|
|||||||
|
|
||||||
def process_images(conn, es):
|
def process_images(conn, es):
|
||||||
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
||||||
cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10")
|
cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10")
|
||||||
for id, content in cursor.fetchall():
|
for id, content in cursor.fetchall():
|
||||||
image = download_image(content)
|
image = download_image(content)
|
||||||
if image is None:
|
if image is None:
|
||||||
continue
|
continue
|
||||||
|
if isinstance(image, Image.Image):
|
||||||
|
image = np.array(image)
|
||||||
|
print('---------------------', id, content)
|
||||||
print(CH.ocr(image, cls=True))
|
print(CH.ocr(image, cls=True))
|
||||||
print(JP.ocr(image, cls=True))
|
print(JP.ocr(image, cls=True))
|
||||||
print(KR.ocr(image, cls=True))
|
print(KR.ocr(image, cls=True))
|
||||||
|
Reference in New Issue
Block a user