From 77589044c91d8583ea6959a39136ea91fbfc1688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A7=89?= Date: Sat, 9 Dec 2023 00:21:09 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AD=98=E5=82=A8=E5=88=B0zinc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pp.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/pp.py b/pp.py index 73fc6fc..00cfa81 100755 --- a/pp.py +++ b/pp.py @@ -1,25 +1,33 @@ #!/usr/bin/env python3 import io -import requests import oss2 -import pymysql import json +import base64 +import dotenv +import pymysql +import requests import numpy as np import warnings import logging from PIL import Image, ImageFile -from dotenv import dotenv_values -from elasticsearch import Elasticsearch from paddleocr import PaddleOCR logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印 logging.disable(logging.WARNING) # 关闭WARNING日志的打印 warnings.filterwarnings("ignore") ImageFile.LOAD_TRUNCATED_IMAGES = True -config = dotenv_values(".env") oss2.defaults.connection_pool_size = 100 +config = dotenv.dotenv_values(".env") +user = config['ZINCSEARCH_USER'] +password = config['ZINCSEARCH_PASSWORD'] +zinc_host = config['ZINCSEARCH_HOST'] +index = config['ZINCSEARCH_INDEX'] +bas64encoded_creds = base64.b64encode(bytes(f"{user}:{password}", "utf-8")).decode("utf-8") +headers = {"Content-type": "application/json", "Authorization": f"Basic {bas64encoded_creds}"} +zinc_url = f"{zinc_host}/api/{index}/_doc" + class MyEncoder(json.JSONEncoder): def default(self, obj): @@ -59,8 +67,7 @@ def connect_to_mysql(): def save_text(conn, id: int, text: str): with conn.cursor() as cursor: - cursor.execute( - "UPDATE web_images SET text = %s WHERE id = %s", (text, id)) + cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id)) # 中英日韩俄 @@ -70,9 +77,9 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan") KR = PaddleOCR(use_angle_cls=True, lang="korean") RU = PaddleOCR(use_angle_cls=True, lang="ru") -def process_images(conn, es): +def process_images(conn): with conn.cursor(pymysql.cursors.SSCursor) as cursor: - cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 0,1000") + cursor.execute("SELECT id, content FROM web_images WHERE id>222193 AND text='' LIMIT 10") for id, content in cursor.fetchall(): image = download_image(content) if image is None: @@ -174,19 +181,15 @@ def process_images(conn, es): print(id, text) save_text(conn, id, data) - es.index(index='web_images', id=id, body={'content': text}) + res = requests.put(zinc_url, headers=headers, data=json.dumps(data), proxies={'http': '', 'https': ''}) + print("\033[1;32m{}\033[0m".format(id) if json.loads(res.text)['message'] == 'ok' else id, text) conn.commit() - def main(): - es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=( - config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False) - if not es.indices.exists(index='web_images'): - es.indices.create(index='web_images') conn = connect_to_mysql() - process_images(conn, es) + process_images(conn) if __name__ == "__main__": - for _ in range(1000): + for _ in range(1): main()