Files
webp/bin/ocr.py
2023-04-20 17:31:09 +08:00

69 lines
1.6 KiB
Python

# Image text recognition (OCR) implemented in Python
import pytesseract
from PIL import Image
# Location of the tesseract executable (Linux).
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Load the input picture and turn it into a grayscale image.
image = Image.open('data/tt.jpeg').convert('L')

# Binarize: grey levels below the threshold map to 0, everything else to 1.
threshold = 127
table = [0 if level < threshold else 1 for level in range(256)]
image = image.point(table, '1')

# Recognize Chinese and English text and keep the result as ALTO XML.
# Alternative pytesseract calls that can be swapped in here:
#   pytesseract.image_to_string(image, lang='chi_sim+eng')  # text
#   pytesseract.image_to_boxes(image, lang='chi_sim+eng')
#   pytesseract.image_to_data(image, lang='chi_sim+eng')    # coordinates (full data)
data = pytesseract.image_to_alto_xml(image, lang='chi_sim+eng')

# To print the XML itself, pretty-formatted:
#   from xml.dom.minidom import parseString
#   print(parseString(data).toprettyxml())
# Print the result as formatted JSON
import json
from xmljson import badgerfish as bf
from xml.etree.ElementTree import fromstring
# Filter out entries whose '@CONTENT' is empty or made up only of digits.
def filter_data(data):
    """Recursively replace nodes with an empty or all-digit ``@CONTENT`` by None.

    Args:
        data: Any value. Dicts are walked recursively (directly nested dicts
            and the items of directly nested lists); every other value is
            returned untouched.

    Returns:
        ``None`` when ``data`` is a dict whose ``'@CONTENT'`` entry is the
        empty string or consists solely of digits. Otherwise ``data`` itself,
        modified in place: rejected child dicts (and rejected list items)
        are replaced by ``None`` rather than removed, so list positions and
        dict keys are preserved.
    """
    if not isinstance(data, dict):
        return data

    if '@CONTENT' in data:
        content = data['@CONTENT']
        # The value is not guaranteed to be a string (an XML-to-dict converter
        # may already have turned "123" into the int 123); going through
        # str() avoids an AttributeError on such values while giving the
        # same answer for real strings.
        if content == '' or str(content).isdigit():
            return None

    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = filter_data(value)
        elif isinstance(value, list):
            data[key] = [filter_data(item) for item in value]
    return data
# Parse the ALTO XML string held in `data` into an ElementTree element.
xml = fromstring(data)
# Convert the tree with the BadgerFish convention and serialise it as indented
# JSON; ensure_ascii=False keeps non-ASCII characters unescaped in the output.
# The text is stored under its own name so the imported `json` module is not
# overwritten by a string.
# NOTE(review): filter_data is defined in this file but is not applied to the
# converted data here - confirm whether that is intentional.
json_text = json.dumps(bf.data(xml), indent=4, ensure_ascii=False)
print(json_text)