From 92921f99eb5a552ca4d4960c4fb5f1fb6d644af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=A7=89?= Date: Fri, 1 Dec 2023 02:39:13 +0800 Subject: [PATCH] =?UTF-8?q?=E7=A7=BB=E9=99=A4=20text=20=E9=A2=84=E8=A7=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/settings.json | 3 +++ main.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..14f6030 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "editor.inlineSuggest.showToolbar": "onHover" +} \ No newline at end of file diff --git a/main.py b/main.py index d0a10b6..5804cd3 100755 --- a/main.py +++ b/main.py @@ -45,15 +45,15 @@ def save_text(conn, id:int, text:str): def process_images(conn, ocr, es): with conn.cursor(pymysql.cursors.SSCursor) as cursor: - cursor.execute("SELECT id, content, text FROM web_images WHERE text!='' LIMIT 10") - for id, content, text in cursor.fetchall(): + cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10") + for id, content in cursor.fetchall(): image = download_image(content) if image is None: continue item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder)) - texts = ' '.join([x['text'] for x in item]) - es.index(index='web_images', id=id, body={'content': texts}) + text = ' '.join([x['text'] for x in item]) + es.index(index='web_images', id=id, body={'content': text}) def main(): es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)