155 lines
16 KiB
Plaintext
155 lines
16 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[{'id': 0, 'list': [{'text': 'O5U7TQK939', 'xywh': [9, 5, 9, 10]}, {'text': 'CSDR7VUTPG', 'xywh': [3, 9, 10, 9]}, {'text': 'SU4V44QZFH', 'xywh': [2, 6, 4, 4]}, {'text': 'OM2CNFD1K4', 'xywh': [5, 9, 6, 10]}, {'text': 'IV2M8DSCU8', 'xywh': [4, 2, 10, 7]}, {'text': 'A1QLXTAM9E', 'xywh': [2, 2, 4, 5]}, {'text': '0BB746F3KR', 'xywh': [2, 8, 9, 9]}, {'text': 'VX5C2DTUK0', 'xywh': [4, 1, 5, 0]}, {'text': '99R91CHWQH', 'xywh': [8, 2, 7, 6]}, {'text': 'GPSZH1F1EJ', 'xywh': [9, 0, 1, 9]}]}, {'id': 1, 'list': [{'text': 'JGHHC85B7S', 'xywh': [5, 1, 2, 1]}, {'text': 'RDW9CP14DQ', 'xywh': [3, 6, 4, 7]}, {'text': 'UU7HVLJ34K', 'xywh': [10, 4, 2, 5]}, {'text': '1UOY8BBH8C', 'xywh': [1, 8, 2, 9]}, {'text': '6C832LNW83', 'xywh': [6, 5, 7, 8]}, {'text': 'T7VT80L3SQ', 'xywh': [1, 1, 9, 0]}, {'text': 'J04KPQG7N2', 'xywh': [9, 7, 10, 10]}, {'text': 'TWXYGJDRUY', 'xywh': [2, 7, 4, 7]}, {'text': 'T50TTFOC0Q', 'xywh': [2, 5, 2, 1]}, {'text': 'ZNCJBIMYCC', 'xywh': [8, 6, 8, 3]}]}, {'id': 2, 'list': [{'text': 'K74PJ2YA8M', 'xywh': [7, 0, 7, 2]}, {'text': 'PRBXH5UC5R', 'xywh': [2, 10, 7, 3]}, {'text': 'N4YS5UJ214', 'xywh': [2, 1, 9, 4]}, {'text': 'NBAJLP2Z5U', 'xywh': [9, 1, 10, 0]}, {'text': '9LKGLH9V3A', 'xywh': [1, 0, 3, 1]}, {'text': 'EWJJ4V5SPL', 'xywh': [8, 0, 7, 9]}, {'text': '1MI3ZX58OG', 'xywh': [8, 1, 8, 4]}, {'text': 'WI2P0RBM4W', 'xywh': [5, 1, 6, 5]}, {'text': 'CU3UWAI4IV', 'xywh': [1, 8, 10, 10]}, {'text': 'DXK4ZUYM6C', 'xywh': [0, 10, 3, 3]}]}, {'id': 3, 'list': [{'text': 'KX0WNNTDUB', 'xywh': [5, 7, 5, 9]}, {'text': '3XYEVIFTB9', 'xywh': [1, 9, 6, 1]}, {'text': '8175P4NRKI', 'xywh': [4, 3, 4, 3]}, {'text': 'WWZ7T93NH2', 'xywh': [8, 10, 10, 1]}, {'text': '7SASCLCQTD', 'xywh': [2, 1, 0, 6]}, {'text': 'KJHMDGNYLT', 'xywh': [8, 7, 0, 2]}, {'text': 'S0Z0OP6LG0', 'xywh': [6, 8, 5, 1]}, {'text': '2K45XQ74HA', 'xywh': [2, 10, 10, 5]}, {'text': 'FE1T408C5C', 'xywh': [10, 9, 5, 8]}, {'text': 'F5B4QES2ZD', 'xywh': [9, 10, 0, 10]}]}, {'id': 4, 'list': [{'text': 'VQW9253WAK', 'xywh': [7, 
1, 1, 5]}, {'text': 'QC2JGP5H8W', 'xywh': [5, 6, 7, 3]}, {'text': '1MV4MPWVA2', 'xywh': [2, 6, 5, 1]}, {'text': 'MYBZXEIY2M', 'xywh': [10, 2, 7, 4]}, {'text': 'B5FPS1Y5QD', 'xywh': [4, 2, 2, 1]}, {'text': 'VUNRLE9Y7K', 'xywh': [9, 1, 6, 6]}, {'text': 'IZ4XLRTT0G', 'xywh': [4, 1, 7, 0]}, {'text': '0OIA0BKV3K', 'xywh': [5, 2, 9, 8]}, {'text': '8HYATQ9DFN', 'xywh': [0, 10, 8, 8]}, {'text': 'YQ4A2J7HG5', 'xywh': [3, 0, 3, 5]}]}, {'id': 5, 'list': [{'text': '94SAIMPTRW', 'xywh': [5, 6, 8, 8]}, {'text': '5CQ7T02S5N', 'xywh': [7, 6, 7, 6]}, {'text': '51OACCWNVH', 'xywh': [8, 9, 1, 4]}, {'text': 'FBNVBQ91BI', 'xywh': [5, 0, 2, 0]}, {'text': 'DA5LLJMJ0R', 'xywh': [2, 8, 10, 9]}, {'text': 'ZH7T7UEUJK', 'xywh': [10, 6, 3, 6]}, {'text': 'DRX3MSMXKK', 'xywh': [9, 10, 0, 1]}, {'text': '07T15S11IW', 'xywh': [9, 3, 9, 5]}, {'text': 'PCWWF7TF88', 'xywh': [4, 10, 1, 7]}, {'text': 'EOWE3G4CIK', 'xywh': [2, 6, 7, 3]}]}, {'id': 6, 'list': [{'text': 'AG0F83XSYY', 'xywh': [3, 5, 1, 3]}, {'text': 'QDBZGIBITF', 'xywh': [3, 3, 10, 1]}, {'text': 'DUZ4072HJM', 'xywh': [3, 10, 5, 6]}, {'text': 'J4AU0W0612', 'xywh': [2, 6, 6, 6]}, {'text': 'U8T1T9K6HW', 'xywh': [4, 10, 5, 1]}, {'text': 'FC078A5VCL', 'xywh': [9, 4, 1, 1]}, {'text': 'IJYAML9F4C', 'xywh': [2, 9, 5, 0]}, {'text': '8QHX8K2872', 'xywh': [9, 8, 10, 10]}, {'text': 'CZDIQ5YPKD', 'xywh': [7, 4, 4, 2]}, {'text': 'UYBVUK20T5', 'xywh': [5, 0, 5, 0]}]}, {'id': 7, 'list': [{'text': 'C4FBFY5S7H', 'xywh': [3, 4, 7, 2]}, {'text': '3G9CY618CL', 'xywh': [5, 7, 2, 1]}, {'text': '8DJMO7R8W1', 'xywh': [1, 5, 1, 3]}, {'text': 'XNZFVDQAMN', 'xywh': [5, 1, 2, 2]}, {'text': '8VV3A0E5HT', 'xywh': [3, 2, 6, 2]}, {'text': 'U5BVGN8TK1', 'xywh': [1, 6, 1, 9]}, {'text': '87V0U64T0H', 'xywh': [1, 2, 7, 9]}, {'text': 'S8PMKTPG50', 'xywh': [1, 2, 5, 4]}, {'text': 'DXINY9FJP0', 'xywh': [1, 3, 6, 9]}, {'text': '4VHG2VJKV2', 'xywh': [2, 9, 6, 9]}]}, {'id': 8, 'list': [{'text': 'CD9AMIPLWN', 'xywh': [2, 7, 1, 9]}, {'text': '5KQHUK2LRQ', 'xywh': [6, 2, 8, 9]}, 
{'text': 'UYJ2ASBDGF', 'xywh': [3, 9, 10, 10]}, {'text': '3UHYDKWENL', 'xywh': [3, 8, 8, 4]}, {'text': 'FPBGA0MA0D', 'xywh': [3, 9, 3, 1]}, {'text': '0SLSZA9TIU', 'xywh': [9, 8, 2, 1]}, {'text': '56O8MVARB6', 'xywh': [1, 5, 6, 4]}, {'text': 'JMAD5N8DEU', 'xywh': [5, 6, 6, 4]}, {'text': 'XSILUMSLM2', 'xywh': [10, 3, 4, 3]}, {'text': '8MV5CHORRX', 'xywh': [2, 5, 6, 2]}]}, {'id': 9, 'list': [{'text': 'QMEM059TUQ', 'xywh': [9, 8, 10, 6]}, {'text': '1027VBNYJ5', 'xywh': [2, 7, 5, 6]}, {'text': 'Y5XQ4KX0S7', 'xywh': [2, 3, 3, 3]}, {'text': 'JHCUFM3QFK', 'xywh': [3, 7, 2, 8]}, {'text': 'EKPJS33CA5', 'xywh': [10, 8, 5, 5]}, {'text': 'EYTH8YRM0K', 'xywh': [4, 7, 4, 7]}, {'text': 'TZK4M8ZCRC', 'xywh': [10, 4, 10, 10]}, {'text': 'FW51VCOR9R', 'xywh': [8, 10, 8, 5]}, {'text': 'GI10U2T2E0', 'xywh': [7, 0, 0, 10]}, {'text': 'KY9X6OA9P8', 'xywh': [7, 7, 4, 7]}]}]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Generate synthetic sample data shaped like:\n",
"# [{'id': 0, 'list': [{'text': 'A1B2C3D4E5', 'xywh': [1, 2, 3, 4]}, ...]}, ...]\n",
"def create_data(num=100, items_per_record=10, max_coord=None, seed=None):\n",
"    \"\"\"Build a list of synthetic records with random text and integer values.\n",
"\n",
"    Args:\n",
"        num: Number of records to create; ids run from 0 to num - 1.\n",
"        items_per_record: Number of entries in each record's 'list'.\n",
"        max_coord: Inclusive upper bound for every 'xywh' value. When None it\n",
"            falls back to `num`, which is what the original code always used.\n",
"        seed: Optional seed. When given, a private generator is used so the\n",
"            same seed always yields the same data. When None, values come from\n",
"            the module-level `random` generator, exactly as before.\n",
"\n",
"    Returns:\n",
"        A list of dicts of the form {'id': int, 'list': [...]}, where each list\n",
"        entry is {'text': <10 chars from A-Z and 0-9>, 'xywh': <4 ints>}.\n",
"    \"\"\"\n",
"    import random\n",
"    import string\n",
"\n",
"    # Keep the historical default: values are bounded by the record count.\n",
"    if max_coord is None:\n",
"        max_coord = num\n",
"    rng = random if seed is None else random.Random(seed)\n",
"    # Loop invariant, so build the alphabet once instead of once per item.\n",
"    alphabet = string.ascii_uppercase + string.digits\n",
"\n",
"    return [\n",
"        {\n",
"            'id': record_id,\n",
"            'list': [\n",
"                {\n",
"                    'text': ''.join(rng.choices(alphabet, k=10)),\n",
"                    'xywh': [rng.randint(0, max_coord) for _ in range(4)],\n",
"                }\n",
"                for _ in range(items_per_record)\n",
"            ],\n",
"        }\n",
"        for record_id in range(num)\n",
"    ]\n",
|
|
"\n",
|
"# Build a small 10-record sample and print it for inspection.\n",
"dataset = create_data(num=10)\n",
"print(dataset)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/satori/ocr/venv/lib/python3.11/site-packages/elasticsearch/_sync/client/__init__.py:397: SecurityWarning: Connecting to 'https://47.102.112.57:9200' using TLS with verify_certs=False is insecure\n",
|
|
" _transport = transport_class(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import warnings\n",
|
|
"from urllib3.exceptions import InsecureRequestWarning\n",
|
|
"warnings.simplefilter('ignore', InsecureRequestWarning)\n",
|
|
"\n",
|
|
"import json\n",
|
|
"from elasticsearch import Elasticsearch\n",
|
"import os\n",
"\n",
"# Connection settings are read from the environment when available, so real\n",
"# credentials never have to be written into the notebook. The fallbacks are\n",
"# the values this notebook originally hard-coded.\n",
"ES_URL = os.environ.get(\"ES_URL\", \"https://47.102.112.57:9200\")\n",
"ES_USER = os.environ.get(\"ES_USER\", \"elastic\")\n",
"ES_PASSWORD = os.environ.get(\"ES_PASSWORD\", \"password\")\n",
"\n",
"# verify_certs=False skips TLS certificate verification (insecure). The client\n",
"# reports this with its own SecurityWarning, which the InsecureRequestWarning\n",
"# filter earlier in this cell does not match; ssl_show_warn=False silences it.\n",
"es = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASSWORD), verify_certs=False, ssl_show_warn=False)\n",
"\n",
"# Start from an empty index. ignore_unavailable=True keeps the delete from\n",
"# failing on a first run, when 'my_index' does not exist yet.\n",
"es.indices.delete(index='my_index', ignore_unavailable=True)\n",
"es.indices.create(index='my_index')\n",
|
|
"\n",
|
|
"dataset = [{'id': 587, 'content': '[{\"text\": \"福利\", \"score\": 0, \"position\": [[61, 21], [90, 21], [90, 38], [61, 38]]}, {\"text\": \"33.3万\", \"score\": 0, \"position\": [[579, 25], [633, 25], [633, 41], [579, 41]]}, {\"text\": \"2875万\", \"score\": 0, \"position\": [[733, 25], [794, 25], [794, 40], [733, 40]]}, {\"text\": \"好看视频\", \"score\": 0, \"position\": [[925, 30], [1091, 25], [1093, 65], [926, 70]]}, {\"text\": \"勇土领取了首次值第三天的奖励,得了经典坐\", \"score\": 0, \"position\": [[352, 82], [710, 82], [710, 102], [352, 102]]}, {\"text\": \"累计到\", \"score\": 0, \"position\": [[842, 74], [908, 70], [909, 92], [843, 95]]}, {\"text\": \"@搞事丸\", \"score\": 0, \"position\": [[985, 79], [1087, 82], [1086, 108], [984, 104]]}, {\"text\": \"最强\", \"score\": 0, \"position\": [[176, 119], [243, 119], [243, 139], [176, 139]]}, {\"text\": \"第1天\", \"score\": 0, \"position\": [[362, 150], [401, 150], [401, 166], [362, 166]]}, {\"text\": \"第2天\", \"score\": 0, \"position\": [[457, 150], [496, 150], [496, 166], [457, 166]]}, {\"text\": \"第3天\", \"score\": 0, \"position\": [[550, 149], [593, 149], [593, 168], [550, 168]]}, {\"text\": \"第4天\", \"score\": 0, \"position\": [[644, 147], [691, 147], [691, 168], [644, 168]]}, {\"text\": \"第5天\", \"score\": 0, \"position\": [[740, 149], [785, 149], [785, 168], [740, 168]]}, {\"text\": \"周礼\", \"score\": 0, \"position\": [[175, 190], [241, 190], [241, 209], [175, 209]]}, {\"text\": \"到有礼\", \"score\": 0, \"position\": [[175, 260], [243, 260], [243, 280], [175, 280]]}, {\"text\": \"第6天\", \"score\": 0, \"position\": [[361, 272], [401, 272], [401, 290], [361, 290]]}, {\"text\": \"第7天\", \"score\": 0, \"position\": [[458, 273], [496, 273], [496, 290], [458, 290]]}, {\"text\": \"第8天\", \"score\": 0, \"position\": [[553, 273], [590, 273], [590, 289], [553, 289]]}, {\"text\": \"第9天\", \"score\": 0, \"position\": [[649, 273], [686, 273], [686, 289], [649, 289]]}, {\"text\": \"第10天\", \"score\": 0, \"position\": [[736, 270], [787, 270], [787, 290], 
[736, 290]]}, {\"text\": \"升级有礼\", \"score\": 0, \"position\": [[177, 331], [241, 331], [241, 351], [177, 351]]}, {\"text\": \"充值返还\", \"score\": 0, \"position\": [[179, 402], [243, 402], [243, 422], [179, 422]]}, {\"text\": \"第1\", \"score\": 0, \"position\": [[359, 396], [403, 396], [403, 412], [359, 412]]}, {\"text\": \"第12天\", \"score\": 0, \"position\": [[454, 393], [500, 393], [500, 413], [454, 413]]}, {\"text\": \"第13天\", \"score\": 0, \"position\": [[548, 393], [596, 393], [596, 413], [548, 413]]}, {\"text\": \"第14天\", \"score\": 0, \"position\": [[644, 395], [689, 395], [689, 410], [644, 410]]}, {\"text\": \"第15天\", \"score\": 0, \"position\": [[737, 393], [787, 393], [787, 413], [737, 413]]}, {\"text\": \"手机绑定\", \"score\": 0, \"position\": [[177, 473], [241, 473], [241, 492], [177, 492]]}, {\"text\": \"更多福利\", \"score\": 0, \"position\": [[144, 542], [241, 542], [241, 562], [144, 562]]}, {\"text\": \"到3天\", \"score\": 0, \"position\": [[298, 586], [356, 586], [356, 606], [298, 606]]}, {\"text\": \"签到8天\", \"score\": 0, \"position\": [[419, 586], [477, 586], [477, 606], [419, 606]]}, {\"text\": \"签到14天\", \"score\": 0, \"position\": [[535, 586], [601, 586], [601, 606], [535, 606]]}, {\"text\": \"到21天\", \"score\": 0, \"position\": [[656, 586], [721, 586], [721, 606], [656, 606]]}, {\"text\": \"签到28天\", \"score\": 0, \"position\": [[773, 588], [844, 584], [845, 604], [774, 607]]}]'}, {'id': 606, 'content': '[{\"text\": \"Setu Subtites\", \"score\": 0, \"position\": [[100, 96], [313, 100], [312, 135], [99, 131]]}, {\"text\": \"Subtite Mode\", \"score\": 0, \"position\": [[100, 176], [199, 176], [199, 198], [100, 198]]}, {\"text\": \"None\", \"score\": 0, \"position\": [[555, 184], [589, 184], [589, 206], [555, 206]]}, {\"text\": \"Subtitle Size\", \"score\": 0, \"position\": [[110, 263], [199, 263], [199, 284], [110, 284]]}, {\"text\": \"Smalt Defaut)\", \"score\": 0, \"position\": [[526, 267], [624, 267], [624, 288], [526, 288]]}, {\"text\": \"Subtite 
Languae\", \"score\": 0, \"position\": [[112, 343], [236, 343], [236, 365], [112, 365]]}, {\"text\": \"Francais\", \"score\": 0, \"position\": [[546, 345], [601, 345], [601, 366], [546, 366]]}, {\"text\": \"Subtite Contrast\", \"score\": 0, \"position\": [[112, 427], [230, 427], [230, 449], [112, 449]]}, {\"text\": \"Chiteis\", \"score\": 0, \"position\": [[95, 494], [404, 488], [404, 515], [96, 522]]}, {\"text\": \"Next ECHAP Back\", \"score\": 0, \"position\": [[101, 588], [294, 584], [295, 611], [102, 616]]}]'}, {'id': 767, 'content': '[{\"text\": \"Apple\", \"score\": 0, \"position\": [[826, 272], [958, 276], [957, 320], [825, 316]]}, {\"text\": \"Annihilator\", \"score\": 0, \"position\": [[772, 327], [1016, 330], [1015, 368], [772, 364]]}, {\"text\": \"Dees\", \"score\": 0, \"position\": [[752, 426], [864, 426], [864, 452], [752, 452]]}, {\"text\": \"58/100\", \"score\": 0, \"position\": [[939, 518], [1008, 518], [1008, 551], [939, 551]]}, {\"text\": \"Men\", \"score\": 0, \"position\": [[105, 595], [153, 595], [153, 619], [105, 619]]}]'}, {'id': 766, 'content': '[{\"text\": \"Bluebery Slime\", \"score\": 0, \"position\": [[511, 139], [706, 139], [706, 178], [511, 178]]}, {\"text\": \"Petiteandterifiedofbeingetalon\", \"score\": 0, \"position\": [[617, 205], [877, 205], [877, 226], [617, 226]]}, {\"text\": \"Bueberriesmoveinpacksand sticktotheir\", \"score\": 0, \"position\": [[616, 238], [917, 238], [917, 259], [616, 259]]}, {\"text\": \"peed\", \"score\": 0, \"position\": [[506, 279], [552, 279], [552, 303], [506, 303]]}, {\"text\": \"targets suroundin them and preventinthein\", \"score\": 0, \"position\": [[614, 271], [933, 271], [933, 292], [614, 292]]}]'}, {'id': 774, 'content': '[{\"text\": \"植物学家\", \"score\": 0, \"position\": [[1444, 451], [1538, 451], [1538, 480], [1444, 480]]}, {\"text\": \"阿尔法\", \"score\": 0, \"position\": [[1253, 494], [1532, 494], [1532, 578], [1253, 578]]}, {\"text\": \"可复活一次\", \"score\": 0, \"position\": [[1367, 706], [1487, 
706], [1487, 739], [1367, 739]]}, {\"text\": \"技能:召唤豌豆射手\", \"score\": 0, \"position\": [[1280, 743], [1481, 743], [1481, 770], [1280, 770]]}, {\"text\": \"转到设置\", \"score\": 0, \"position\": [[1328, 790], [1381, 790], [1381, 804], [1328, 804]]}]'}]\n",
|
|
"\n",
|
"# Build the index: one document per record, whose \"content\" field is the\n",
"# space-joined \"text\" of every item decoded from the record's JSON string.\n",
"for record in dataset:\n",
"    parsed_items = json.loads(record[\"content\"])\n",
"    joined_text = ' '.join(entry[\"text\"] for entry in parsed_items)\n",
"    es.index(index=\"my_index\", id=str(record['id']), body={\"content\": joined_text})\n",
|
|
"\n",
|
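"# Alternative indexing scheme (disabled): one document per item of data[\"list\"], with id \"<record id>_<item text>\"\n",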
|
|
"#for data in dataset:\n",
|
|
"# for item in data[\"list\"]:\n",
|
|
"# doc_id = f\"{data['id']}_{item['text']}\"\n",
|
|
"# es.index(index=\"my_index\", id=doc_id, body=item)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},\n",
|
|
" 'hits': {'hits': [{'_id': '774',\n",
|
|
" '_index': 'my_index',\n",
|
|
" '_score': 3.2047122,\n",
|
|
" '_source': {'content': '植物学家 阿尔法 可复活一次 技能:召唤豌豆射手 转到设置'}},\n",
|
|
" {'_id': '587',\n",
|
|
" '_index': 'my_index',\n",
|
|
" '_score': 2.2945156,\n",
|
|
" '_source': {'content': '福利 33.3万 2875万 好看视频 '\n",
|
|
" '勇土领取了首次值第三天的奖励,得了经典坐 累计到 @搞事丸 最强 '\n",
|
|
" '第1天 第2天 第3天 第4天 第5天 周礼 到有礼 第6天 第7天 '\n",
|
|
" '第8天 第9天 第10天 升级有礼 充值返还 第1 第12天 '\n",
|
|
" '第13天 第14天 第15天 手机绑定 更多福利 到3天 签到8天 '\n",
|
|
" '签到14天 到21天 签到28天'}}],\n",
|
|
" 'max_score': 3.2047122,\n",
|
|
" 'total': {'relation': 'eq', 'value': 2}},\n",
|
|
" 'timed_out': False,\n",
|
|
" 'took': 0}\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/satori/ocr/venv/lib/python3.11/site-packages/elasticsearch/_sync/client/__init__.py:397: SecurityWarning: Connecting to 'https://47.102.112.57:9200' using TLS with verify_certs=False is insecure\n",
|
|
" _transport = transport_class(\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import warnings\n",
|
|
"from pprint import pprint\n",
|
|
"from urllib3.exceptions import InsecureRequestWarning\n",
|
|
"warnings.simplefilter('ignore', InsecureRequestWarning)\n",
|
|
"\n",
|
|
"from elasticsearch import Elasticsearch\n",
|
"import os\n",
"\n",
"# Connection settings are read from the environment when available, so real\n",
"# credentials never have to be written into the notebook. The fallbacks are\n",
"# the values this notebook originally hard-coded.\n",
"ES_URL = os.environ.get(\"ES_URL\", \"https://47.102.112.57:9200\")\n",
"ES_USER = os.environ.get(\"ES_USER\", \"elastic\")\n",
"ES_PASSWORD = os.environ.get(\"ES_PASSWORD\", \"password\")\n",
"\n",
"# verify_certs=False skips TLS certificate verification (insecure). The client\n",
"# reports this with its own SecurityWarning, which the InsecureRequestWarning\n",
"# filter earlier in this cell does not match; ssl_show_warn=False silences it.\n",
"es = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASSWORD), verify_certs=False, ssl_show_warn=False)\n",
|
|
"\n",
|
"# Search: run a match query for the two terms against the \"content\" field\n",
"# and pretty-print the raw response body.\n",
"res = es.search(\n",
"    index=\"my_index\",\n",
"    body={\"query\": {\"match\": {\"content\": \"豌豆 福利\"}}},\n",
")\n",
"pprint(res.body)\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|