diff --git a/pp.py b/pp.py index 00cfa81..0e95779 100755 --- a/pp.py +++ b/pp.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 +import os import io import oss2 +import time import json import base64 import dotenv @@ -46,7 +48,11 @@ def download_image(url: str, max_size=32767) -> Image.Image: if url.startswith('http://image.gameuiux.cn/') or url.startswith('https://image.gameuiux.cn/'): url = url.replace('http://image.gameuiux.cn/', '').replace('https://image.gameuiux.cn/', '') oss_auth = oss2.Auth(config['OSS_ACCESS_KEY_ID'], config['OSS_ACCESS_KEY_SECRET']) - img = Image.open(io.BytesIO(oss2.Bucket(oss_auth, f'http://{config["OSS_HOST"]}', config['OSS_BUCKET_NAME']).get_object(url).read())) + if os.path.exists(url): + img = Image.open(url) + else: + print(f'从OSS下载图片 {url}') + img = Image.open(io.BytesIO(oss2.Bucket(oss_auth, f'http://{config["OSS_HOST"]}', config['OSS_BUCKET_NAME']).get_object(url).read())) else: response = requests.get(url) img = Image.open(io.BytesIO(response.content)) @@ -65,10 +71,6 @@ def connect_to_mysql(): return pymysql.connect(host=config['MYSQL_HOST'], user=config['MYSQL_USER'], password=config['MYSQL_PASSWORD'], database=config['MYSQL_NAME'], cursorclass=pymysql.cursors.SSDictCursor) -def save_text(conn, id: int, text: str): - with conn.cursor() as cursor: - cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id)) - # 中英日韩俄 EN = PaddleOCR(use_angle_cls=True, lang="en") @@ -79,7 +81,7 @@ RU = PaddleOCR(use_angle_cls=True, lang="ru") def process_images(conn): with conn.cursor(pymysql.cursors.SSCursor) as cursor: - cursor.execute("SELECT id, content FROM web_images WHERE id>222193 AND text='' LIMIT 10") + cursor.execute("SELECT id, content FROM web_images WHERE text='' AND text!='[]' AND article_category_top_id=22 LIMIT 10") for id, content in cursor.fetchall(): image = download_image(content) if image is None: @@ -173,23 +175,22 @@ def process_images(conn): data.append({'text': 文本[0], 'confidence': 文本[1], 'coordinate': 坐标 }) # 转换为字符串存储到索引库 - text = ' '.join([x['text'] for x in data]) + obj = { "_id": str(id), "text": ' '.join([x['text'] for x in data]) } + res = requests.put(zinc_url, headers=headers, data=json.dumps(obj), proxies={'http': '', 'https': ''}) + print("\033[1;32m{}\033[0m".format(id) if json.loads(res.text)['message'] == 'ok' else id, text) # 转换为 JSON 存储到数据库 - data = json.dumps(data, ensure_ascii=False, cls=MyEncoder) - - print(id, text) - - save_text(conn, id, data) - res = requests.put(zinc_url, headers=headers, data=json.dumps(data), proxies={'http': '', 'https': ''}) - print("\033[1;32m{}\033[0m".format(id) if json.loads(res.text)['message'] == 'ok' else id, text) - conn.commit() + with conn.cursor() as cursor: + data = json.dumps(data, ensure_ascii=False, cls=MyEncoder) + cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (data, id)) + conn.commit() def main(): conn = connect_to_mysql() - process_images(conn) + while True: + process_images(conn) + time.sleep(10) if __name__ == "__main__": - for _ in range(1): - main() + main()