存储到zinc

This commit is contained in:
2023-12-09 00:21:09 +08:00
parent c1257c6d29
commit 77589044c9

37
pp.py
View File

@@ -1,25 +1,33 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import io import io
import requests
import oss2 import oss2
import pymysql
import json import json
import base64
import dotenv
import pymysql
import requests
import numpy as np import numpy as np
import warnings import warnings
import logging import logging
from PIL import Image, ImageFile from PIL import Image, ImageFile
from dotenv import dotenv_values
from elasticsearch import Elasticsearch
from paddleocr import PaddleOCR from paddleocr import PaddleOCR
logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印 logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印
logging.disable(logging.WARNING) # 关闭WARNING日志的打印 logging.disable(logging.WARNING) # 关闭WARNING日志的打印
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
ImageFile.LOAD_TRUNCATED_IMAGES = True ImageFile.LOAD_TRUNCATED_IMAGES = True
config = dotenv_values(".env")
oss2.defaults.connection_pool_size = 100 oss2.defaults.connection_pool_size = 100
config = dotenv.dotenv_values(".env")
user = config['ZINCSEARCH_USER']
password = config['ZINCSEARCH_PASSWORD']
zinc_host = config['ZINCSEARCH_HOST']
index = config['ZINCSEARCH_INDEX']
bas64encoded_creds = base64.b64encode(bytes(f"{user}:{password}", "utf-8")).decode("utf-8")
headers = {"Content-type": "application/json", "Authorization": f"Basic {bas64encoded_creds}"}
zinc_url = f"{zinc_host}/api/{index}/_doc"
class MyEncoder(json.JSONEncoder): class MyEncoder(json.JSONEncoder):
def default(self, obj): def default(self, obj):
@@ -59,8 +67,7 @@ def connect_to_mysql():
def save_text(conn, id: int, text: str): def save_text(conn, id: int, text: str):
with conn.cursor() as cursor: with conn.cursor() as cursor:
cursor.execute( cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id))
"UPDATE web_images SET text = %s WHERE id = %s", (text, id))
# 中英日韩俄 # 中英日韩俄
@@ -70,9 +77,9 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
KR = PaddleOCR(use_angle_cls=True, lang="korean") KR = PaddleOCR(use_angle_cls=True, lang="korean")
RU = PaddleOCR(use_angle_cls=True, lang="ru") RU = PaddleOCR(use_angle_cls=True, lang="ru")
def process_images(conn, es): def process_images(conn):
with conn.cursor(pymysql.cursors.SSCursor) as cursor: with conn.cursor(pymysql.cursors.SSCursor) as cursor:
cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 0,1000") cursor.execute("SELECT id, content FROM web_images WHERE id>222193 AND text='' LIMIT 10")
for id, content in cursor.fetchall(): for id, content in cursor.fetchall():
image = download_image(content) image = download_image(content)
if image is None: if image is None:
@@ -174,19 +181,15 @@ def process_images(conn, es):
print(id, text) print(id, text)
save_text(conn, id, data) save_text(conn, id, data)
es.index(index='web_images', id=id, body={'content': text}) res = requests.put(zinc_url, headers=headers, data=json.dumps(data), proxies={'http': '', 'https': ''})
print("\033[1;32m{}\033[0m".format(id) if json.loads(res.text)['message'] == 'ok' else id, text)
conn.commit() conn.commit()
def main(): def main():
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(
config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
if not es.indices.exists(index='web_images'):
es.indices.create(index='web_images')
conn = connect_to_mysql() conn = connect_to_mysql()
process_images(conn, es) process_images(conn)
if __name__ == "__main__": if __name__ == "__main__":
for _ in range(1000): for _ in range(1):
main() main()