diff --git a/.gitignore b/.gitignore index 5d381cc..0616307 100644 --- a/.gitignore +++ b/.gitignore @@ -129,6 +129,8 @@ ENV/ env.bak/ venv.bak/ +database + # Spyder project settings .spyderproject .spyproject diff --git a/main.py b/main.py old mode 100644 new mode 100755 index 890cd4e..dd1c418 --- a/main.py +++ b/main.py @@ -1,9 +1,35 @@ +#!/usr/bin/env python3 + import io -import oss2 import requests +import oss2 +import plyvel from PIL import Image, ImageFile + +# 创建或打开一个数据库 +db = plyvel.DB('database', create_if_missing=True) + +''' +# 写入一个键值对 +db.put(b'key', b'value') +# 获取一个键的值 +value = db.get(b'key') +# 删除一个键值对 +db.delete(b'key') +# 批量写入 +with db.write_batch() as wb: + for i in range(10000): + wb.put(b'key' + str(i).encode(), b'value' + str(i).encode()) +# 迭代数据库中的所有键值对 +for key, value in db: + print(key, value) +# 关闭数据库 +db.close() +''' + + # 读取 .env from dotenv import dotenv_values config = dotenv_values(".env") @@ -19,39 +45,44 @@ def download_image(url:str) -> Image.Image: oss_auth = oss2.Auth(config['OSS_ACCESS_KEY_ID'], config['OSS_ACCESS_KEY_SECRET']) return Image.open(io.BytesIO(oss2.Bucket(oss_auth, f'http://{config["OSS_HOST"]}', config['OSS_BUCKET_NAME']).get_object(url).read())) except Exception: - print('图片下载失败:', url) + print('图片从OSS下载失败:', url) return None else: try: response = requests.get(url) return Image.open(io.BytesIO(response.content)) except Exception: - print('图片下载失败:', url) + print('图片从URL下载失败:', url) return None - import pymysql import pymysql.cursors import cnocr +# 打开 mysql ocr = cnocr.CnOcr(rec_model_name='ch_PP-OCRv3') conn = pymysql.connect(host=config['MYSQL_HOST'], user=config['MYSQL_USER'], password=config['MYSQL_PASSWORD'], database=config['MYSQL_NAME'], cursorclass=pymysql.cursors.DictCursor) cursor = conn.cursor() cursor.execute("SELECT id, content FROM web_images LIMIT 5") -# 获取查询结果 -rows = cursor.fetchall() -for row in rows: - image = download_image(row['content']) +# 获取查询结果(跳过下载失败的) +for item in cursor.fetchall(): + 
image = download_image(item['content']) if image is None: - print('图片下载失败,跳过') continue + # 将只包含那些非空非纯数字且长度大于1的'text'值 out = ocr.ocr(image) - # 这段代码将只包含那些非空、不是纯数字且长度大于1的'text'值 - texts = [item['text'] for item in out if item['text'] and not item['text'].isdigit() and len(item['text']) > 1] - print(row['id'], texts) + texts = [x['text'] for x in out if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] + print(item['id'], texts) + # 将结果存入 leveldb + # db.put(str(item['id']).encode(), ','.join(texts).encode()) # 关闭游标和连接 cursor.close() conn.close() + +# 关闭数据库 +db.close() + +print('Done')