69 lines
1.6 KiB
Python
69 lines
1.6 KiB
Python
# python 實現圖片文字辨識(ocr)
|
|
|
|
import pytesseract
|
|
from PIL import Image
|
|
|
|
# Path to the tesseract executable (Linux).
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Languages to extract: Chinese and English. Defined once so every OCR call
# below uses the same setting.
OCR_LANG = 'chi_sim+eng'

# Read the input image.
image = Image.open('data/tt.jpeg')

# Convert the image to grayscale.
image = image.convert('L')

# Binarize the image: gray levels below the threshold map to 0, all others
# to 1, via a 256-entry lookup table (one entry per possible gray level).
threshold = 127
table = [0 if level < threshold else 1 for level in range(256)]
image = image.point(table, '1')

# Alternative outputs, kept for reference.
# Image to plain string:
#text = pytesseract.image_to_string(image, lang=OCR_LANG)
#print(text)
#
# Character boxes:
#data = pytesseract.image_to_boxes(image, lang=OCR_LANG)
#print(data)

# Coordinates (full data):
#data = pytesseract.image_to_data(image, lang=OCR_LANG)
#print(data)

# OCR result as ALTO XML; this is the value consumed further down the script.
data = pytesseract.image_to_alto_xml(image, lang=OCR_LANG)
#print(data)

## Print the XML pretty-printed
#from xml.dom.minidom import parseString
#dom = parseString(data)
#print(dom.toprettyxml())
|
|
|
|
# 打印格式化的JSON
|
|
import json
|
|
from xmljson import badgerfish as bf
|
|
from xml.etree.ElementTree import fromstring
|
|
|
|
def filter_data(data):
    """Filter out nodes whose ``@CONTENT`` is empty or consists only of digits.

    Walks a nested structure of dicts and lists. A dict whose ``'@CONTENT'``
    value is the empty string, or whose text form is all digits, is replaced
    by ``None``. Dict values that are dicts are filtered recursively; dict
    values that are lists have each element filtered (filtered elements stay
    in the list as ``None`` placeholders). Dicts are modified in place.

    Args:
        data: The structure to filter. Anything that is not a dict is
            returned unchanged.

    Returns:
        ``None`` if ``data`` itself is a filtered node, otherwise ``data``.
    """
    if not isinstance(data, dict):
        return data

    if '@CONTENT' in data:
        content = data['@CONTENT']
        # The XML-to-JSON conversion may already have turned numeric-looking
        # text into int/float/bool, so test the text form instead of calling
        # .isdigit() on the raw value (which fails for non-strings).
        if content == '' or str(content).isdigit():
            return None

    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for key, value in data.items():
        if isinstance(value, dict):
            data[key] = filter_data(value)
        elif isinstance(value, list):
            data[key] = [filter_data(item) for item in value]
    return data
|
|
|
|
# Parse the OCR XML string into an element tree.
xml = fromstring(data)

# Convert the tree with the BadgerFish convention and serialize it as
# indented JSON; ensure_ascii=False leaves non-ASCII characters unescaped.
# The result is bound to its own name: assigning it to ``json`` would replace
# the imported module with a string and break any later json.* call.
json_text = json.dumps(bf.data(xml), indent=4, ensure_ascii=False)
print(json_text)
|
|
|