存储到zinc
This commit is contained in:
37
pp.py
37
pp.py
@@ -1,25 +1,33 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import io
|
import io
|
||||||
import requests
|
|
||||||
import oss2
|
import oss2
|
||||||
import pymysql
|
|
||||||
import json
|
import json
|
||||||
|
import base64
|
||||||
|
import dotenv
|
||||||
|
import pymysql
|
||||||
|
import requests
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import warnings
|
import warnings
|
||||||
import logging
|
import logging
|
||||||
from PIL import Image, ImageFile
|
from PIL import Image, ImageFile
|
||||||
from dotenv import dotenv_values
|
|
||||||
from elasticsearch import Elasticsearch
|
|
||||||
from paddleocr import PaddleOCR
|
from paddleocr import PaddleOCR
|
||||||
|
|
||||||
logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印
|
logging.disable(logging.DEBUG) # 关闭DEBUG日志的打印
|
||||||
logging.disable(logging.WARNING) # 关闭WARNING日志的打印
|
logging.disable(logging.WARNING) # 关闭WARNING日志的打印
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||||
config = dotenv_values(".env")
|
|
||||||
oss2.defaults.connection_pool_size = 100
|
oss2.defaults.connection_pool_size = 100
|
||||||
|
|
||||||
|
config = dotenv.dotenv_values(".env")
|
||||||
|
user = config['ZINCSEARCH_USER']
|
||||||
|
password = config['ZINCSEARCH_PASSWORD']
|
||||||
|
zinc_host = config['ZINCSEARCH_HOST']
|
||||||
|
index = config['ZINCSEARCH_INDEX']
|
||||||
|
bas64encoded_creds = base64.b64encode(bytes(f"{user}:{password}", "utf-8")).decode("utf-8")
|
||||||
|
headers = {"Content-type": "application/json", "Authorization": f"Basic {bas64encoded_creds}"}
|
||||||
|
zinc_url = f"{zinc_host}/api/{index}/_doc"
|
||||||
|
|
||||||
|
|
||||||
class MyEncoder(json.JSONEncoder):
|
class MyEncoder(json.JSONEncoder):
|
||||||
def default(self, obj):
|
def default(self, obj):
|
||||||
@@ -59,8 +67,7 @@ def connect_to_mysql():
|
|||||||
|
|
||||||
def save_text(conn, id: int, text: str):
|
def save_text(conn, id: int, text: str):
|
||||||
with conn.cursor() as cursor:
|
with conn.cursor() as cursor:
|
||||||
cursor.execute(
|
cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id))
|
||||||
"UPDATE web_images SET text = %s WHERE id = %s", (text, id))
|
|
||||||
|
|
||||||
|
|
||||||
# 中英日韩俄
|
# 中英日韩俄
|
||||||
@@ -70,9 +77,9 @@ JP = PaddleOCR(use_angle_cls=True, lang="japan")
|
|||||||
KR = PaddleOCR(use_angle_cls=True, lang="korean")
|
KR = PaddleOCR(use_angle_cls=True, lang="korean")
|
||||||
RU = PaddleOCR(use_angle_cls=True, lang="ru")
|
RU = PaddleOCR(use_angle_cls=True, lang="ru")
|
||||||
|
|
||||||
def process_images(conn, es):
|
def process_images(conn):
|
||||||
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
with conn.cursor(pymysql.cursors.SSCursor) as cursor:
|
||||||
cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 0,1000")
|
cursor.execute("SELECT id, content FROM web_images WHERE id>222193 AND text='' LIMIT 10")
|
||||||
for id, content in cursor.fetchall():
|
for id, content in cursor.fetchall():
|
||||||
image = download_image(content)
|
image = download_image(content)
|
||||||
if image is None:
|
if image is None:
|
||||||
@@ -174,19 +181,15 @@ def process_images(conn, es):
|
|||||||
print(id, text)
|
print(id, text)
|
||||||
|
|
||||||
save_text(conn, id, data)
|
save_text(conn, id, data)
|
||||||
es.index(index='web_images', id=id, body={'content': text})
|
res = requests.put(zinc_url, headers=headers, data=json.dumps(data), proxies={'http': '', 'https': ''})
|
||||||
|
print("\033[1;32m{}\033[0m".format(id) if json.loads(res.text)['message'] == 'ok' else id, text)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(
|
|
||||||
config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
|
|
||||||
if not es.indices.exists(index='web_images'):
|
|
||||||
es.indices.create(index='web_images')
|
|
||||||
conn = connect_to_mysql()
|
conn = connect_to_mysql()
|
||||||
process_images(conn, es)
|
process_images(conn)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
for _ in range(1000):
|
for _ in range(1):
|
||||||
main()
|
main()
|
||||||
|
Reference in New Issue
Block a user