存储到zinc

2023-12-09 00:21:09 +08:00
parent c1257c6d29
commit 77589044c9
1 changed files with 20 additions and 17 deletions
--- a/pp.py
+++ b/pp.py
@@ -1,25 +1,33 @@
 #!/usr/bin/env python3
 import io
 import requests
 import oss2
 import pymysql
 import json
 import base64
 import dotenv
 import pymysql
 import requests
 import numpy as np
 import warnings
 import logging
 from PIL import Image, ImageFile
 from dotenv import dotenv_values
 from elasticsearch import Elasticsearch
 from paddleocr import PaddleOCR
 logging.disable(logging.DEBUG)    # 关闭DEBUG日志的打印
 logging.disable(logging.WARNING)  # 关闭WARNING日志的打印
 warnings.filterwarnings("ignore")
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 config = dotenv_values(".env")
 oss2.defaults.connection_pool_size = 100
 config = dotenv.dotenv_values(".env")
 user = config['ZINCSEARCH_USER']
 password = config['ZINCSEARCH_PASSWORD']
 zinc_host = config['ZINCSEARCH_HOST']
 index = config['ZINCSEARCH_INDEX']
 bas64encoded_creds = base64.b64encode(bytes(f"{user}:{password}", "utf-8")).decode("utf-8")
 headers = {"Content-type": "application/json", "Authorization": f"Basic {bas64encoded_creds}"}
 zinc_url = f"{zinc_host}/api/{index}/_doc"
 class MyEncoder(json.JSONEncoder):
    def default(self, obj):
@@ -59,8 +67,7 @@ def connect_to_mysql():
 def save_text(conn, id: int, text: str):
    with conn.cursor() as cursor:
-        cursor.execute(
-            "UPDATE web_images SET text = %s WHERE id = %s", (text, id))
+        cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id))
 # 中英日韩俄
@@ -70,9 +77,9 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
 KR = PaddleOCR(use_angle_cls=True, lang="korean")
 RU = PaddleOCR(use_angle_cls=True, lang="ru")
-def process_images(conn, es):
+def process_images(conn):
    with conn.cursor(pymysql.cursors.SSCursor) as cursor:
-        cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 0,1000")
+        cursor.execute("SELECT id, content FROM web_images WHERE id>222193 AND text='' LIMIT 10")
        for id, content in cursor.fetchall():
            image = download_image(content)
            if image is None:
@@ -174,19 +181,15 @@ def process_images(conn, es):
            print(id, text)
            save_text(conn, id, data)
-            es.index(index='web_images', id=id, body={'content': text})
+            res = requests.put(zinc_url, headers=headers, data=json.dumps(data), proxies={'http': '', 'https': ''})
            print("\033[1;32m{}\033[0m".format(id) if json.loads(res.text)['message'] == 'ok' else id, text)
            conn.commit()
 def main():
-    es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(
-        config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
-    if not es.indices.exists(index='web_images'):
-        es.indices.create(index='web_images')
    conn = connect_to_mysql()
-    process_images(conn, es)
+    process_images(conn)
 if __name__ == "__main__":
-    for _ in range(1000):
+    for _ in range(1):
        main()