重新生成依赖列表

This commit is contained in:
2023-12-02 02:14:17 +08:00
parent 92921f99eb
commit 990f702c9f
3 changed files with 38 additions and 81 deletions

View File

@@ -7,10 +7,12 @@ import pymysql
import cnocr
import json
import numpy as np
import warnings
from PIL import Image, ImageFile
from dotenv import dotenv_values
from elasticsearch import Elasticsearch
# Silence all warnings for the whole process (no category or module filter is given).
warnings.filterwarnings("ignore")
# Tell Pillow to accept image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Key/value settings parsed from the local ".env" file; read by key elsewhere in this file.
config = dotenv_values(".env")
# Set the oss2 library's default connection pool size to 100.
oss2.defaults.connection_pool_size = 100
@@ -45,15 +47,17 @@ def save_text(conn, id:int, text:str):
# Select rows from the web_images table, run OCR on each downloaded image,
# store the filtered OCR entries through save_text, and index the joined text
# in Elasticsearch.
#
# conn: database connection; must provide cursor() and commit().
# ocr:  object whose ocr(image) yields dict entries that carry a 'text' key.
# es:   client whose index(index=..., id=..., body=...) stores one document.
#
# NOTE(review): this text was taken from a rendered diff with indentation and
# +/- markers stripped, so removed and added lines may appear side by side.
# Confirm against the real file which statements are live.
def process_images(conn, ocr, es):
# An unbuffered pymysql cursor (SSCursor) is requested for the SELECT.
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
# NOTE(review): two execute() calls back to back; the rows of the first
# (text!='' LIMIT 10) are never read. Presumably it is the pre-change line
# of the diff and only the second query is live -- confirm.
cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10")
cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10000")
# fetchall() pulls the complete result set before the loop body starts.
for id, content in cursor.fetchall():
image = download_image(content)
# Skip rows for which download_image returned None.
if image is None:
continue
# Keep only OCR entries whose text is non-empty, not all digits, and longer
# than one character.
item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
# NOTE(review): save_text is called here and again three lines below with
# identical arguments; presumably one of the two is diff residue -- confirm.
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
# Space-separated text of the kept entries; printed and sent to Elasticsearch.
text = ' '.join([x['text'] for x in item])
print(id, text)
save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
# The row id is reused as the Elasticsearch document id.
es.index(index='web_images', id=id, body={'content': text})
# NOTE(review): whether this commit runs per row or once after the loop cannot
# be determined from the flattened text -- confirm the intended nesting.
conn.commit()
def main():
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)