From f3a5d44c5778974156276451a59a322abe9fd508 Mon Sep 17 00:00:00 2001
From: satori <huan0016@gmail.com>
Date: Tue, 5 Dec 2023 03:10:46 +0800
Subject: [PATCH] =?UTF-8?q?=E5=BD=92=E5=B9=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md        | 10 +++++--
 main.py          | 71 ------------------------------------------------
 pp.py            |  5 +++-
 requirements.txt |  2 --
 4 files changed, 12 insertions(+), 76 deletions(-)
 delete mode 100755 main.py

diff --git a/README.md b/README.md
index afc7aea..e7d3016 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,9 @@
-# ocr
+# OCR
+
+基于深度学习的文字识别提取标记
+- 由于当前没有较优的语言分类识别方案，使用四倍算力换取精度
+- 当前支持英文、中文、日文、韩文、俄文的识别
+- 去除纯数字和单字符以及置信度低于80的文字
+- 数据转为 JSON 后存储于 MySQL 的 web_images 表中每张图像对应的 text 字段
+- 文字以空格分隔合并为字符串加入 Elasticsearch 索引
 
-基于深度学习的文字识别提取标记
\ No newline at end of file
diff --git a/main.py b/main.py
deleted file mode 100755
index c524ec6..0000000
--- a/main.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-
-import io
-import requests
-import oss2
-import pymysql
-import cnocr
-import json
-import numpy as np
-import warnings
-from PIL import Image, ImageFile
-from dotenv import dotenv_values
-from elasticsearch import Elasticsearch
-
-warnings.filterwarnings("ignore")
-ImageFile.LOAD_TRUNCATED_IMAGES = True
-config = dotenv_values(".env")
-oss2.defaults.connection_pool_size = 100
-
-class MyEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, np.float32):
-            return int(obj)
-        if isinstance(obj, np.ndarray):
-            return obj.astype(int).tolist()
-        return super(MyEncoder, self).default(obj)
-
-def download_image(url:str) -> Image.Image:
-    try:
-        if url.startswith('http://image.gameuiux.cn/') or url.startswith('https://image.gameuiux.cn/'):
-            url = url.replace('http://image.gameuiux.cn/', '').replace('https://image.gameuiux.cn/', '')
-            oss_auth = oss2.Auth(config['OSS_ACCESS_KEY_ID'], config['OSS_ACCESS_KEY_SECRET'])
-            return Image.open(io.BytesIO(oss2.Bucket(oss_auth, f'http://{config["OSS_HOST"]}', config['OSS_BUCKET_NAME']).get_object(url).read()))
-        else:
-            response = requests.get(url)
-            return Image.open(io.BytesIO(response.content))
-    except Exception:
-        print(f'图片从{url}下载失败')
-        return None
-
-def connect_to_mysql():
-    return pymysql.connect(host=config['MYSQL_HOST'], user=config['MYSQL_USER'], password=config['MYSQL_PASSWORD'], database=config['MYSQL_NAME'], cursorclass=pymysql.cursors.SSDictCursor)
-
-def save_text(conn, id:int, text:str):
-    with conn.cursor() as cursor:
-        cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id))
-
-def process_images(conn, ocr, es):
-    with conn.cursor(pymysql.cursors.SSCursor) as cursor:
-        cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10000")
-        for id, content in cursor.fetchall():
-            image = download_image(content)
-            if image is None:
-                continue
-            item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
-            text = ' '.join([x['text'] for x in item])
-            print(id, text)
-            #save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
-            #es.index(index='web_images', id=id, body={'content': text})
-        #conn.commit()
-
-def main():
-    es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
-    if not es.indices.exists(index='web_images'):
-        es.indices.create(index='web_images')
-    ocr = cnocr.CnOcr(rec_model_name='ch_PP-OCRv3')
-    conn = connect_to_mysql()
-    process_images(conn, ocr, es)
-
-if __name__ == "__main__":
-    main()
diff --git a/pp.py b/pp.py
index b4f0850..73fc6fc 100755
--- a/pp.py
+++ b/pp.py
@@ -30,7 +30,7 @@ class MyEncoder(json.JSONEncoder):
         return super(MyEncoder, self).default(obj)
 
 
-def download_image(url: str) -> Image.Image:
+def download_image(url: str, max_size=32767) -> Image.Image:
     if url.endswith('.gif') or url.endswith('.GIF'):
         print(f'跳过GIF {url}')
         return None
@@ -44,6 +44,9 @@ def download_image(url: str) -> Image.Image:
             img = Image.open(io.BytesIO(response.content))
         if img.mode != 'RGB':
             img = img.convert('RGB')
+        if img.size[0] > max_size or img.size[1] > max_size:
+            print(f'跳过尺寸过大的图像 {url}')
+            return None
         return img
     except Exception as e:
         print(f'图片从{url}下载失败，错误信息为：{e}')
diff --git a/requirements.txt b/requirements.txt
index 3e4b7d9..2545c47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,10 @@
 whell==0.42.0
-cnocr==2.2.4.2
 elasticsearch==8.11.0
 numpy==1.26.2
 oss2==2.18.3
 paddleocr==2.7.0.3
 paddleocr.egg==info
 Pillow==10.1.0
-Pillow==10.1.0
 PyMySQL==1.1.0
 python-dotenv==1.0.0
 Requests==2.31.0