diff --git a/demo.ipynb b/demo.ipynb deleted file mode 100644 index 5e9844d..0000000 --- a/demo.ipynb +++ /dev/null @@ -1,154 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'id': 0, 'list': [{'text': 'O5U7TQK939', 'xywh': [9, 5, 9, 10]}, {'text': 'CSDR7VUTPG', 'xywh': [3, 9, 10, 9]}, {'text': 'SU4V44QZFH', 'xywh': [2, 6, 4, 4]}, {'text': 'OM2CNFD1K4', 'xywh': [5, 9, 6, 10]}, {'text': 'IV2M8DSCU8', 'xywh': [4, 2, 10, 7]}, {'text': 'A1QLXTAM9E', 'xywh': [2, 2, 4, 5]}, {'text': '0BB746F3KR', 'xywh': [2, 8, 9, 9]}, {'text': 'VX5C2DTUK0', 'xywh': [4, 1, 5, 0]}, {'text': '99R91CHWQH', 'xywh': [8, 2, 7, 6]}, {'text': 'GPSZH1F1EJ', 'xywh': [9, 0, 1, 9]}]}, {'id': 1, 'list': [{'text': 'JGHHC85B7S', 'xywh': [5, 1, 2, 1]}, {'text': 'RDW9CP14DQ', 'xywh': [3, 6, 4, 7]}, {'text': 'UU7HVLJ34K', 'xywh': [10, 4, 2, 5]}, {'text': '1UOY8BBH8C', 'xywh': [1, 8, 2, 9]}, {'text': '6C832LNW83', 'xywh': [6, 5, 7, 8]}, {'text': 'T7VT80L3SQ', 'xywh': [1, 1, 9, 0]}, {'text': 'J04KPQG7N2', 'xywh': [9, 7, 10, 10]}, {'text': 'TWXYGJDRUY', 'xywh': [2, 7, 4, 7]}, {'text': 'T50TTFOC0Q', 'xywh': [2, 5, 2, 1]}, {'text': 'ZNCJBIMYCC', 'xywh': [8, 6, 8, 3]}]}, {'id': 2, 'list': [{'text': 'K74PJ2YA8M', 'xywh': [7, 0, 7, 2]}, {'text': 'PRBXH5UC5R', 'xywh': [2, 10, 7, 3]}, {'text': 'N4YS5UJ214', 'xywh': [2, 1, 9, 4]}, {'text': 'NBAJLP2Z5U', 'xywh': [9, 1, 10, 0]}, {'text': '9LKGLH9V3A', 'xywh': [1, 0, 3, 1]}, {'text': 'EWJJ4V5SPL', 'xywh': [8, 0, 7, 9]}, {'text': '1MI3ZX58OG', 'xywh': [8, 1, 8, 4]}, {'text': 'WI2P0RBM4W', 'xywh': [5, 1, 6, 5]}, {'text': 'CU3UWAI4IV', 'xywh': [1, 8, 10, 10]}, {'text': 'DXK4ZUYM6C', 'xywh': [0, 10, 3, 3]}]}, {'id': 3, 'list': [{'text': 'KX0WNNTDUB', 'xywh': [5, 7, 5, 9]}, {'text': '3XYEVIFTB9', 'xywh': [1, 9, 6, 1]}, {'text': '8175P4NRKI', 'xywh': [4, 3, 4, 3]}, {'text': 'WWZ7T93NH2', 'xywh': [8, 10, 10, 1]}, {'text': '7SASCLCQTD', 'xywh': [2, 1, 0, 6]}, {'text': 'KJHMDGNYLT', 'xywh': [8, 7, 0, 2]}, {'text': 'S0Z0OP6LG0', 'xywh': [6, 8, 5, 1]}, {'text': '2K45XQ74HA', 'xywh': [2, 10, 10, 5]}, {'text': 'FE1T408C5C', 'xywh': [10, 9, 5, 8]}, {'text': 'F5B4QES2ZD', 'xywh': [9, 10, 0, 10]}]}, {'id': 4, 'list': [{'text': 'VQW9253WAK', 'xywh': [7, 1, 1, 5]}, {'text': 'QC2JGP5H8W', 'xywh': [5, 6, 7, 3]}, {'text': '1MV4MPWVA2', 'xywh': [2, 6, 5, 1]}, {'text': 'MYBZXEIY2M', 'xywh': [10, 2, 7, 4]}, {'text': 'B5FPS1Y5QD', 'xywh': [4, 2, 2, 1]}, {'text': 'VUNRLE9Y7K', 'xywh': [9, 1, 6, 6]}, {'text': 'IZ4XLRTT0G', 'xywh': [4, 1, 7, 0]}, {'text': '0OIA0BKV3K', 'xywh': [5, 2, 9, 8]}, {'text': '8HYATQ9DFN', 'xywh': [0, 10, 8, 8]}, {'text': 'YQ4A2J7HG5', 'xywh': [3, 0, 3, 5]}]}, {'id': 5, 'list': [{'text': '94SAIMPTRW', 'xywh': [5, 6, 8, 8]}, {'text': '5CQ7T02S5N', 'xywh': [7, 6, 7, 6]}, {'text': '51OACCWNVH', 'xywh': [8, 9, 1, 4]}, {'text': 'FBNVBQ91BI', 'xywh': [5, 0, 2, 0]}, {'text': 'DA5LLJMJ0R', 'xywh': [2, 8, 10, 9]}, {'text': 'ZH7T7UEUJK', 'xywh': [10, 6, 3, 6]}, {'text': 'DRX3MSMXKK', 'xywh': [9, 10, 0, 1]}, {'text': '07T15S11IW', 'xywh': [9, 3, 9, 5]}, {'text': 'PCWWF7TF88', 'xywh': [4, 10, 1, 7]}, {'text': 'EOWE3G4CIK', 'xywh': [2, 6, 7, 3]}]}, {'id': 6, 'list': [{'text': 'AG0F83XSYY', 'xywh': [3, 5, 1, 3]}, {'text': 'QDBZGIBITF', 'xywh': [3, 3, 10, 1]}, {'text': 'DUZ4072HJM', 'xywh': [3, 10, 5, 6]}, {'text': 'J4AU0W0612', 'xywh': [2, 6, 6, 6]}, {'text': 'U8T1T9K6HW', 'xywh': [4, 10, 5, 1]}, {'text': 'FC078A5VCL', 'xywh': [9, 4, 1, 1]}, {'text': 'IJYAML9F4C', 'xywh': [2, 9, 5, 0]}, {'text': '8QHX8K2872', 'xywh': [9, 8, 10, 10]}, {'text': 'CZDIQ5YPKD', 'xywh': [7, 4, 4, 2]}, {'text': 'UYBVUK20T5', 'xywh': [5, 0, 5, 0]}]}, {'id': 7, 'list': [{'text': 'C4FBFY5S7H', 'xywh': [3, 4, 7, 2]}, {'text': '3G9CY618CL', 'xywh': [5, 7, 2, 1]}, {'text': '8DJMO7R8W1', 'xywh': [1, 5, 1, 3]}, {'text': 'XNZFVDQAMN', 'xywh': [5, 1, 2, 2]}, {'text': '8VV3A0E5HT', 'xywh': [3, 2, 6, 2]}, {'text': 'U5BVGN8TK1', 'xywh': [1, 6, 1, 9]}, {'text': '87V0U64T0H', 'xywh': [1, 2, 7, 9]}, {'text': 'S8PMKTPG50', 'xywh': [1, 2, 5, 4]}, {'text': 'DXINY9FJP0', 'xywh': [1, 3, 6, 9]}, {'text': '4VHG2VJKV2', 'xywh': [2, 9, 6, 9]}]}, {'id': 8, 'list': [{'text': 'CD9AMIPLWN', 'xywh': [2, 7, 1, 9]}, {'text': '5KQHUK2LRQ', 'xywh': [6, 2, 8, 9]}, {'text': 'UYJ2ASBDGF', 'xywh': [3, 9, 10, 10]}, {'text': '3UHYDKWENL', 'xywh': [3, 8, 8, 4]}, {'text': 'FPBGA0MA0D', 'xywh': [3, 9, 3, 1]}, {'text': '0SLSZA9TIU', 'xywh': [9, 8, 2, 1]}, {'text': '56O8MVARB6', 'xywh': [1, 5, 6, 4]}, {'text': 'JMAD5N8DEU', 'xywh': [5, 6, 6, 4]}, {'text': 'XSILUMSLM2', 'xywh': [10, 3, 4, 3]}, {'text': '8MV5CHORRX', 'xywh': [2, 5, 6, 2]}]}, {'id': 9, 'list': [{'text': 'QMEM059TUQ', 'xywh': [9, 8, 10, 6]}, {'text': '1027VBNYJ5', 'xywh': [2, 7, 5, 6]}, {'text': 'Y5XQ4KX0S7', 'xywh': [2, 3, 3, 3]}, {'text': 'JHCUFM3QFK', 'xywh': [3, 7, 2, 8]}, {'text': 'EKPJS33CA5', 'xywh': [10, 8, 5, 5]}, {'text': 'EYTH8YRM0K', 'xywh': [4, 7, 4, 7]}, {'text': 'TZK4M8ZCRC', 'xywh': [10, 4, 10, 10]}, {'text': 'FW51VCOR9R', 'xywh': [8, 10, 8, 5]}, {'text': 'GI10U2T2E0', 'xywh': [7, 0, 0, 10]}, {'text': 'KY9X6OA9P8', 'xywh': [7, 7, 4, 7]}]}]\n" - ] - } - ], - "source": [ - "# 生成数据, 格式如下:\n", - "# [{id: 1, list: [{text:'a', xywh:[1,2,3,4]}, {text:'b', xywh:[1,2,3,4]}]}]\n", - "def create_data(num=100):\n", - " import random\n", - " import string\n", - " data = []\n", - " for i in range(num):\n", - " data.append({\n", - " 'id': i,\n", - " 'list': [{\n", - " 'text': ''.join(random.choices(string.ascii_uppercase + string.digits, k=10)),\n", - " 'xywh': [random.randint(0, num) for _ in range(4)]\n", - " } for _ in range(10)]\n", - " })\n", - " return data\n", - "\n", - "dataset = create_data(10)\n", - "print(dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/satori/ocr/venv/lib/python3.11/site-packages/elasticsearch/_sync/client/__init__.py:397: SecurityWarning: Connecting to 'https://47.102.112.57:9200' using TLS with verify_certs=False is insecure\n", - " _transport = transport_class(\n" - ] - } - ], - "source": [ - "import warnings\n", - "from urllib3.exceptions import InsecureRequestWarning\n", - "warnings.simplefilter('ignore', InsecureRequestWarning)\n", - "\n", - "import json\n", - "from elasticsearch import Elasticsearch\n", - "es = Elasticsearch(\"https://47.102.112.57:9200\", basic_auth=(\"elastic\", \"password\"), verify_certs=False)\n", - "es.indices.delete(index='my_index')\n", - "es.indices.create(index='my_index')\n", - "\n", - "dataset = [{'id': 587, 'content': '[{\"text\": \"福利\", \"score\": 0, \"position\": [[61, 21], [90, 21], [90, 38], [61, 38]]}, {\"text\": \"33.3万\", \"score\": 0, \"position\": [[579, 25], [633, 25], [633, 41], [579, 41]]}, {\"text\": \"2875万\", \"score\": 0, \"position\": [[733, 25], [794, 25], [794, 40], [733, 40]]}, {\"text\": \"好看视频\", \"score\": 0, \"position\": [[925, 30], [1091, 25], [1093, 65], [926, 70]]}, {\"text\": \"勇土领取了首次值第三天的奖励,得了经典坐\", \"score\": 0, \"position\": [[352, 82], [710, 82], [710, 102], [352, 102]]}, {\"text\": \"累计到\", \"score\": 0, \"position\": [[842, 74], [908, 70], [909, 92], [843, 95]]}, {\"text\": \"@搞事丸\", \"score\": 0, \"position\": [[985, 79], [1087, 82], [1086, 108], [984, 104]]}, {\"text\": \"最强\", \"score\": 0, \"position\": [[176, 119], [243, 119], [243, 139], [176, 139]]}, {\"text\": \"第1天\", \"score\": 0, \"position\": [[362, 150], [401, 150], [401, 166], [362, 166]]}, {\"text\": \"第2天\", \"score\": 0, \"position\": [[457, 150], [496, 150], [496, 166], [457, 166]]}, {\"text\": \"第3天\", \"score\": 0, \"position\": [[550, 149], [593, 149], [593, 168], [550, 168]]}, {\"text\": \"第4天\", \"score\": 0, \"position\": [[644, 147], [691, 147], [691, 168], [644, 168]]}, {\"text\": \"第5天\", \"score\": 0, \"position\": [[740, 149], [785, 149], [785, 168], [740, 168]]}, {\"text\": \"周礼\", \"score\": 0, \"position\": [[175, 190], [241, 190], [241, 209], [175, 209]]}, {\"text\": \"到有礼\", \"score\": 0, \"position\": [[175, 260], [243, 260], [243, 280], [175, 280]]}, {\"text\": \"第6天\", \"score\": 0, \"position\": [[361, 272], [401, 272], [401, 290], [361, 290]]}, {\"text\": \"第7天\", \"score\": 0, \"position\": [[458, 273], [496, 273], [496, 290], [458, 290]]}, {\"text\": \"第8天\", \"score\": 0, \"position\": [[553, 273], [590, 273], [590, 289], [553, 289]]}, {\"text\": \"第9天\", \"score\": 0, \"position\": [[649, 273], [686, 273], [686, 289], [649, 289]]}, {\"text\": \"第10天\", \"score\": 0, \"position\": [[736, 270], [787, 270], [787, 290], [736, 290]]}, {\"text\": \"升级有礼\", \"score\": 0, \"position\": [[177, 331], [241, 331], [241, 351], [177, 351]]}, {\"text\": \"充值返还\", \"score\": 0, \"position\": [[179, 402], [243, 402], [243, 422], [179, 422]]}, {\"text\": \"第1\", \"score\": 0, \"position\": [[359, 396], [403, 396], [403, 412], [359, 412]]}, {\"text\": \"第12天\", \"score\": 0, \"position\": [[454, 393], [500, 393], [500, 413], [454, 413]]}, {\"text\": \"第13天\", \"score\": 0, \"position\": [[548, 393], [596, 393], [596, 413], [548, 413]]}, {\"text\": \"第14天\", \"score\": 0, \"position\": [[644, 395], [689, 395], [689, 410], [644, 410]]}, {\"text\": \"第15天\", \"score\": 0, \"position\": [[737, 393], [787, 393], [787, 413], [737, 413]]}, {\"text\": \"手机绑定\", \"score\": 0, \"position\": [[177, 473], [241, 473], [241, 492], [177, 492]]}, {\"text\": \"更多福利\", \"score\": 0, \"position\": [[144, 542], [241, 542], [241, 562], [144, 562]]}, {\"text\": \"到3天\", \"score\": 0, \"position\": [[298, 586], [356, 586], [356, 606], [298, 606]]}, {\"text\": \"签到8天\", \"score\": 0, \"position\": [[419, 586], [477, 586], [477, 606], [419, 606]]}, {\"text\": \"签到14天\", \"score\": 0, \"position\": [[535, 586], [601, 586], [601, 606], [535, 606]]}, {\"text\": \"到21天\", \"score\": 0, \"position\": [[656, 586], [721, 586], [721, 606], [656, 606]]}, {\"text\": \"签到28天\", \"score\": 0, \"position\": [[773, 588], [844, 584], [845, 604], [774, 607]]}]'}, {'id': 606, 'content': '[{\"text\": \"Setu Subtites\", \"score\": 0, \"position\": [[100, 96], [313, 100], [312, 135], [99, 131]]}, {\"text\": \"Subtite Mode\", \"score\": 0, \"position\": [[100, 176], [199, 176], [199, 198], [100, 198]]}, {\"text\": \"None\", \"score\": 0, \"position\": [[555, 184], [589, 184], [589, 206], [555, 206]]}, {\"text\": \"Subtitle Size\", \"score\": 0, \"position\": [[110, 263], [199, 263], [199, 284], [110, 284]]}, {\"text\": \"Smalt Defaut)\", \"score\": 0, \"position\": [[526, 267], [624, 267], [624, 288], [526, 288]]}, {\"text\": \"Subtite Languae\", \"score\": 0, \"position\": [[112, 343], [236, 343], [236, 365], [112, 365]]}, {\"text\": \"Francais\", \"score\": 0, \"position\": [[546, 345], [601, 345], [601, 366], [546, 366]]}, {\"text\": \"Subtite Contrast\", \"score\": 0, \"position\": [[112, 427], [230, 427], [230, 449], [112, 449]]}, {\"text\": \"Chiteis\", \"score\": 0, \"position\": [[95, 494], [404, 488], [404, 515], [96, 522]]}, {\"text\": \"Next ECHAP Back\", \"score\": 0, \"position\": [[101, 588], [294, 584], [295, 611], [102, 616]]}]'}, {'id': 767, 'content': '[{\"text\": \"Apple\", \"score\": 0, \"position\": [[826, 272], [958, 276], [957, 320], [825, 316]]}, {\"text\": \"Annihilator\", \"score\": 0, \"position\": [[772, 327], [1016, 330], [1015, 368], [772, 364]]}, {\"text\": \"Dees\", \"score\": 0, \"position\": [[752, 426], [864, 426], [864, 452], [752, 452]]}, {\"text\": \"58/100\", \"score\": 0, \"position\": [[939, 518], [1008, 518], [1008, 551], [939, 551]]}, {\"text\": \"Men\", \"score\": 0, \"position\": [[105, 595], [153, 595], [153, 619], [105, 619]]}]'}, {'id': 766, 'content': '[{\"text\": \"Bluebery Slime\", \"score\": 0, \"position\": [[511, 139], [706, 139], [706, 178], [511, 178]]}, {\"text\": \"Petiteandterifiedofbeingetalon\", \"score\": 0, \"position\": [[617, 205], [877, 205], [877, 226], [617, 226]]}, {\"text\": \"Bueberriesmoveinpacksand sticktotheir\", \"score\": 0, \"position\": [[616, 238], [917, 238], [917, 259], [616, 259]]}, {\"text\": \"peed\", \"score\": 0, \"position\": [[506, 279], [552, 279], [552, 303], [506, 303]]}, {\"text\": \"targets suroundin them and preventinthein\", \"score\": 0, \"position\": [[614, 271], [933, 271], [933, 292], [614, 292]]}]'}, {'id': 774, 'content': '[{\"text\": \"植物学家\", \"score\": 0, \"position\": [[1444, 451], [1538, 451], [1538, 480], [1444, 480]]}, {\"text\": \"阿尔法\", \"score\": 0, \"position\": [[1253, 494], [1532, 494], [1532, 578], [1253, 578]]}, {\"text\": \"可复活一次\", \"score\": 0, \"position\": [[1367, 706], [1487, 706], [1487, 739], [1367, 739]]}, {\"text\": \"技能:召唤豌豆射手\", \"score\": 0, \"position\": [[1280, 743], [1481, 743], [1481, 770], [1280, 770]]}, {\"text\": \"转到设置\", \"score\": 0, \"position\": [[1328, 790], [1381, 790], [1381, 804], [1328, 804]]}]'}]\n", - "\n", - "# 生成索引\n", - "for data in dataset:\n", - " content = json.loads(data[\"content\"])\n", - " texts = []\n", - " for item in content:\n", - " texts.append(item[\"text\"])\n", - " doc_id = f\"{data['id']}\"\n", - " es.index(index=\"my_index\", id=doc_id, body={\"content\": ' '.join(texts)})\n", - "\n", - "# 生成索引\n", - "#for data in dataset:\n", - "# for item in data[\"list\"]:\n", - "# doc_id = f\"{data['id']}_{item['text']}\"\n", - "# es.index(index=\"my_index\", id=doc_id, body=item)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},\n", - " 'hits': {'hits': [{'_id': '774',\n", - " '_index': 'my_index',\n", - " '_score': 3.2047122,\n", - " '_source': {'content': '植物学家 阿尔法 可复活一次 技能:召唤豌豆射手 转到设置'}},\n", - " {'_id': '587',\n", - " '_index': 'my_index',\n", - " '_score': 2.2945156,\n", - " '_source': {'content': '福利 33.3万 2875万 好看视频 '\n", - " '勇土领取了首次值第三天的奖励,得了经典坐 累计到 @搞事丸 最强 '\n", - " '第1天 第2天 第3天 第4天 第5天 周礼 到有礼 第6天 第7天 '\n", - " '第8天 第9天 第10天 升级有礼 充值返还 第1 第12天 '\n", - " '第13天 第14天 第15天 手机绑定 更多福利 到3天 签到8天 '\n", - " '签到14天 到21天 签到28天'}}],\n", - " 'max_score': 3.2047122,\n", - " 'total': {'relation': 'eq', 'value': 2}},\n", - " 'timed_out': False,\n", - " 'took': 0}\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/satori/ocr/venv/lib/python3.11/site-packages/elasticsearch/_sync/client/__init__.py:397: SecurityWarning: Connecting to 'https://47.102.112.57:9200' using TLS with verify_certs=False is insecure\n", - " _transport = transport_class(\n" - ] - } - ], - "source": [ - "import warnings\n", - "from pprint import pprint\n", - "from urllib3.exceptions import InsecureRequestWarning\n", - "warnings.simplefilter('ignore', InsecureRequestWarning)\n", - "\n", - "from elasticsearch import Elasticsearch\n", - "es = Elasticsearch(\"https://47.102.112.57:9200\", basic_auth=(\"elastic\", \"password\"), verify_certs=False)\n", - "\n", - "# 搜索\n", - "res = es.search(index=\"my_index\", body={\"query\": {\"match\": {\"content\": \"豌豆 福利\"}}})\n", - "pprint(res.body)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/demo.py b/demo.py deleted file mode 100644 index c462a78..0000000 --- a/demo.py +++ /dev/null @@ -1,21 +0,0 @@ -import pymysql -import pymysql.cursors -from dotenv import dotenv_values -from pprint import pprint - -config = dotenv_values(".env") -conn = pymysql.connect(host=config['MYSQL_HOST'], user=config['MYSQL_USER'], password=config['MYSQL_PASSWORD'], database=config['MYSQL_NAME'], cursorclass=pymysql.cursors.DictCursor) -cursor = conn.cursor() -cursor.execute("SELECT * FROM web_images WHERE id=1436682 LIMIT 5") - -# 获取查询结果 -rows = cursor.fetchall() -for row in rows: - # 格式化打印 - pprint(row) - - - -# 关闭游标和连接 -cursor.close() -conn.close() diff --git a/main.py b/main.py index 5826be1..d0a10b6 100755 --- a/main.py +++ b/main.py @@ -3,65 +3,17 @@ import io import requests import oss2 -import plyvel - -from PIL import Image, ImageFile -from pprint import pprint -from dotenv import dotenv_values - -ImageFile.LOAD_TRUNCATED_IMAGES = True - -config = dotenv_values(".env") -db = plyvel.DB('database', create_if_missing=True) - -''' -# 写入一个键值对 -db.put(b'key', b'value') -# 获取一个键的值 -value = db.get(b'key') -# 删除一个键值对 -db.delete(b'key') -# 批量写入 -with db.write_batch() as wb: - for i in range(10000): - wb.put(b'key' + str(i).encode(), b'value' + str(i).encode()) -# 迭代数据库中的所有键值对 -for key, value in db: - print(key, value) -# 关闭数据库 -db.close() -''' - -# 下载图片(使用OSS下载) -def download_image(url:str) -> Image.Image: - if url.startswith('http://image.gameuiux.cn/') or url.startswith('https://image.gameuiux.cn/'): - try: - url = url.replace('http://image.gameuiux.cn/', '').replace('https://image.gameuiux.cn/', '') - oss2.defaults.connection_pool_size = 100 - oss_auth = oss2.Auth(config['OSS_ACCESS_KEY_ID'], config['OSS_ACCESS_KEY_SECRET']) - return Image.open(io.BytesIO(oss2.Bucket(oss_auth, f'http://{config["OSS_HOST"]}', config['OSS_BUCKET_NAME']).get_object(url).read())) - except Exception: - print('图片从OSS下载失败:', url) - return None - else: - try: - response = requests.get(url) - return Image.open(io.BytesIO(response.content)) - except Exception: - print('图片从URL下载失败:', url) - return None - import pymysql -import pymysql.cursors import cnocr import json import numpy as np +from PIL import Image, ImageFile +from dotenv import dotenv_values +from elasticsearch import Elasticsearch -# 打开 mysql -ocr = cnocr.CnOcr(rec_model_name='ch_PP-OCRv3') -conn = pymysql.connect(host=config['MYSQL_HOST'], user=config['MYSQL_USER'], password=config['MYSQL_PASSWORD'], database=config['MYSQL_NAME'], cursorclass=pymysql.cursors.DictCursor) -cursor = conn.cursor() -cursor.execute("SELECT id, content FROM web_images LIMIT 5") +ImageFile.LOAD_TRUNCATED_IMAGES = True +config = dotenv_values(".env") +oss2.defaults.connection_pool_size = 100 class MyEncoder(json.JSONEncoder): def default(self, obj): @@ -71,35 +23,45 @@ class MyEncoder(json.JSONEncoder): return obj.astype(int).tolist() return super(MyEncoder, self).default(obj) -dataset = [] +def download_image(url:str) -> Image.Image: + try: + if url.startswith('http://image.gameuiux.cn/') or url.startswith('https://image.gameuiux.cn/'): + url = url.replace('http://image.gameuiux.cn/', '').replace('https://image.gameuiux.cn/', '') + oss_auth = oss2.Auth(config['OSS_ACCESS_KEY_ID'], config['OSS_ACCESS_KEY_SECRET']) + return Image.open(io.BytesIO(oss2.Bucket(oss_auth, f'http://{config["OSS_HOST"]}', config['OSS_BUCKET_NAME']).get_object(url).read())) + else: + response = requests.get(url) + return Image.open(io.BytesIO(response.content)) + except Exception: + print(f'图片从{url}下载失败') + return None -# 获取查询结果(跳过下载失败的) -for item in cursor.fetchall(): - image = download_image(item['content']) - if image is None: - continue - # 将只包含那些非空非纯数字且长度大于1的'text'值 - out = ocr.ocr(image) - out = [x for x in out if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] - #print(item['id'], json.dumps(out, ensure_ascii=False, cls=MyEncoder)) - dataset.append({ - 'id': item['id'], - 'content': json.dumps(out, ensure_ascii=False, cls=MyEncoder) - }) - - #texts = [x['text'] for x in out if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] - #print(item['id'], texts) +def connect_to_mysql(): + return pymysql.connect(host=config['MYSQL_HOST'], user=config['MYSQL_USER'], password=config['MYSQL_PASSWORD'], database=config['MYSQL_NAME'], cursorclass=pymysql.cursors.SSDictCursor) - # 将结果存入 leveldb - # db.put(str(row['id']).encode(), ','.join(texts).encode()) +def save_text(conn, id:int, text:str): + with conn.cursor() as cursor: + cursor.execute("UPDATE web_images SET text = %s WHERE id = %s", (text, id)) -print(dataset) +def process_images(conn, ocr, es): + with conn.cursor(pymysql.cursors.SSCursor) as cursor: + cursor.execute("SELECT id, content, text FROM web_images WHERE text!='' LIMIT 10") + for id, content, text in cursor.fetchall(): + image = download_image(content) + if image is None: + continue + item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1] + save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder)) + texts = ' '.join([x['text'] for x in item]) + es.index(index='web_images', id=id, body={'content': texts}) -# 关闭游标和连接 -cursor.close() -conn.close() +def main(): + es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False) + if not es.indices.exists(index='web_images'): + es.indices.create(index='web_images') + ocr = cnocr.CnOcr(rec_model_name='ch_PP-OCRv3') + conn = connect_to_mysql() + process_images(conn, ocr, es) -# 关闭数据库 -db.close() - -print('Done') +if __name__ == "__main__": + main() diff --git a/x.jpg b/x.jpg deleted file mode 100644 index 46f02ec..0000000 Binary files a/x.jpg and /dev/null differ