From 77589044c91d8583ea6959a39136ea91fbfc1688 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=A7=89?= <huan0016@gmail.com>
Date: Sat, 9 Dec 2023 00:21:09 +0800
Subject: [PATCH] =?UTF-8?q?=E5=AD=98=E5=82=A8=E5=88=B0zinc?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pp.py | 37 ++++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/pp.py b/pp.py
index 73fc6fc..00cfa81 100755
--- a/pp.py
+++ b/pp.py
@@ -1,25 +1,33 @@
 #!/usr/bin/env python3
 
 import io
-import requests
 import oss2
-import pymysql
 import json
+import base64
+import dotenv
+import pymysql
+import requests
 import numpy as np
 import warnings
 import logging
 from PIL import Image, ImageFile
-from dotenv import dotenv_values
-from elasticsearch import Elasticsearch
 from paddleocr import PaddleOCR
 
 logging.disable(logging.DEBUG)    # 关闭DEBUG日志的打印
 logging.disable(logging.WARNING)  # 关闭WARNING日志的打印
 warnings.filterwarnings("ignore")
 ImageFile.LOAD_TRUNCATED_IMAGES = True
-config = dotenv_values(".env")
 oss2.defaults.connection_pool_size = 100
 
+config = dotenv.dotenv_values(".env")  # settings parsed from the local .env file
+user = config['ZINCSEARCH_USER']
+password = config['ZINCSEARCH_PASSWORD']
+zinc_host = config['ZINCSEARCH_HOST'].rstrip('/')  # drop trailing "/" so the URL below never contains "//api"
+index = config['ZINCSEARCH_INDEX']
+base64encoded_creds = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("utf-8")  # "user:password" for HTTP Basic auth
+headers = {"Content-type": "application/json", "Authorization": f"Basic {base64encoded_creds}"}
+zinc_url = f"{zinc_host}/api/{index}/_doc"  # document endpoint of the configured index
+
 
 class MyEncoder(json.JSONEncoder):
     def default(self, obj):
@@ -59,8 +67,7 @@ def connect_to_mysql():
 
 def save_text(conn, id: int, text: str):
     with conn.cursor() as cursor:
-        cursor.execute(
-            "UPDATE web_images SET text = %s WHERE id = %s", (text, id))
+        cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id))  # values are passed as query parameters, not interpolated into the SQL string
 
 
 # 中英日韩俄
@@ -70,9 +77,9 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
 KR = PaddleOCR(use_angle_cls=True, lang="korean")
 RU = PaddleOCR(use_angle_cls=True, lang="ru")
 
-def process_images(conn, es):
+def process_images(conn):
     with conn.cursor(pymysql.cursors.SSCursor) as cursor:
-        cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 0,1000")
+        cursor.execute("SELECT id, content FROM web_images WHERE id>222193 AND text='' LIMIT 10")  # NOTE(review): fixed id floor (222193) and LIMIT 10 look like debugging values — confirm before merging
         for id, content in cursor.fetchall():
             image = download_image(content)
             if image is None:
@@ -174,19 +181,15 @@ def process_images(conn, es):
             print(id, text)
 
             save_text(conn, id, data)
-            es.index(index='web_images', id=id, body={'content': text})
+            res = requests.put(zinc_url, headers=headers, data=json.dumps(data), proxies={'http': '', 'https': ''})  # NOTE(review): empty proxy map presumably bypasses environment proxies — confirm
+            print("\033[1;32m{}\033[0m".format(id) if res.ok and res.json().get('message') == 'ok' else id, text)  # green id only when Zinc acknowledged the write; error responses no longer raise before commit
             conn.commit()
 
-
 def main():
-    es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(
-        config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
-    if not es.indices.exists(index='web_images'):
-        es.indices.create(index='web_images')
     conn = connect_to_mysql()
-    process_images(conn, es)
+    process_images(conn)  # NOTE(review): conn is not closed after this call in main() — confirm whether that is intended
 
 
 if __name__ == "__main__":
-    for _ in range(1000):
+    for _ in range(1):  # runs main() exactly once
         main()