存储到zinc

This commit is contained in:
2023-12-09 00:21:09 +08:00
parent c1257c6d29
commit 77589044c9

37
pp.py
View File

@@ -1,25 +1,33 @@
#!/usr/bin/env python3
import io
import requests
import oss2
import pymysql
import json
import base64
import dotenv
import pymysql
import requests
import numpy as np
import warnings
import logging
from PIL import Image, ImageFile
from dotenv import dotenv_values
from elasticsearch import Elasticsearch
from paddleocr import PaddleOCR
# Silence log output. logging.disable() sets a single overriding threshold, so
# the second call supersedes the first and suppresses WARNING and below.
logging.disable(logging.DEBUG) # turn off printing of DEBUG logs
logging.disable(logging.WARNING) # turn off printing of WARNING logs
# Suppress everything emitted through the warnings module.
warnings.filterwarnings("ignore")
# Let Pillow load image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# NOTE(review): config is assigned twice in this span (here and two lines
# below), both times from the same ".env" file; the later assignment wins.
config = dotenv_values(".env")
oss2.defaults.connection_pool_size = 100
config = dotenv.dotenv_values(".env")
# ZincSearch connection settings; a missing key raises KeyError at import time.
user = config['ZINCSEARCH_USER']
password = config['ZINCSEARCH_PASSWORD']
zinc_host = config['ZINCSEARCH_HOST']
index = config['ZINCSEARCH_INDEX']
# HTTP Basic credentials: base64 of "user:password" (variable name is sic).
bas64encoded_creds = base64.b64encode(bytes(f"{user}:{password}", "utf-8")).decode("utf-8")
# Headers sent with every ZincSearch request: JSON body plus Basic auth.
headers = {"Content-type": "application/json", "Authorization": f"Basic {bas64encoded_creds}"}
# Document endpoint of the configured index; used as the PUT target when
# a recognized text is pushed to ZincSearch.
zinc_url = f"{zinc_host}/api/{index}/_doc"
class MyEncoder(json.JSONEncoder):
def default(self, obj):
@@ -59,8 +67,7 @@ def connect_to_mysql():
def save_text(conn, id: int, text: str) -> None:
    """Write the recognized text for one ``web_images`` row.

    Args:
        conn: Database connection whose ``cursor()`` result can be used as a
            context manager.
        id: Value matched against the ``id`` column. The name shadows the
            builtin but is kept so existing callers keep working.
        text: Value stored in the row's ``text`` column.

    The statement is parameterized, so ``text`` is never interpolated into
    the SQL string. Nothing is committed here; the caller commits.
    """
    with conn.cursor() as cursor:
        # Issue the UPDATE exactly once; the statement previously appeared
        # twice (multi-line form plus its single-line replacement).
        cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id))
# OCR languages: Chinese, English, Japanese, Korean, Russian
@@ -70,9 +77,9 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
KR = PaddleOCR(use_angle_cls=True, lang="korean")
RU = PaddleOCR(use_angle_cls=True, lang="ru")
def process_images(conn, es):
def process_images(conn):
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 0,1000")
cursor.execute("SELECT id, content FROM web_images WHERE id>222193 AND text='' LIMIT 10")
for id, content in cursor.fetchall():
image = download_image(content)
if image is None:
@@ -174,19 +181,15 @@ def process_images(conn, es):
print(id, text)
save_text(conn, id, data)
es.index(index='web_images', id=id, body={'content': text})
res = requests.put(zinc_url, headers=headers, data=json.dumps(data), proxies={'http': '', 'https': ''})
print("\033[1;32m{}\033[0m".format(id) if json.loads(res.text)['message'] == 'ok' else id, text)
conn.commit()
def main():
    """Run one processing pass over images that have no text yet.

    Opens a MySQL connection via ``connect_to_mysql()``, hands it to
    ``process_images()``, and always closes it afterwards. The former
    Elasticsearch client setup is gone: ``process_images`` now takes only
    the connection, and indexing goes through the ZincSearch HTTP endpoint.
    """
    conn = connect_to_mysql()
    try:
        process_images(conn)
    finally:
        # main() can be called repeatedly from the entry-point loop, so the
        # connection must be released on every pass, including on errors.
        conn.close()
if __name__ == "__main__":
    # Number of passes to run; each pass handles one LIMIT-ed batch of rows.
    # The stale `range(1000)` loop header that sat next to this one is removed.
    for _ in range(1):
        main()