"""Image text recognition (OCR) with Tesseract, printed as JSON.

Pipeline: open the image, convert it to grayscale, binarize it with a fixed
threshold, run ``pytesseract.image_to_alto_xml`` on it, convert the XML to a
BadgerFish-style mapping with ``xmljson``, drop entries whose ``@CONTENT`` is
empty or consists only of digits, and print the result as indented JSON.

Other pytesseract outputs that can be swapped in for the ALTO XML call:
``image_to_string`` (plain text), ``image_to_boxes`` and ``image_to_data``
(coordinates / full data). ``xml.dom.minidom.parseString(xml).toprettyxml()``
pretty-prints the raw XML instead of converting it to JSON.
"""
import json
from xml.etree.ElementTree import fromstring

# Path of the tesseract executable (Linux).
TESSERACT_CMD = '/usr/bin/tesseract'
# Image that is read when the script is run directly.
IMAGE_PATH = 'data/tt.jpeg'
# Language argument passed to tesseract (extract Chinese and English).
OCR_LANG = 'chi_sim+eng'
# Gray levels below this value map to 0, all others to 1.
BINARIZE_THRESHOLD = 127


def build_threshold_table(threshold=BINARIZE_THRESHOLD):
    """Return the 256-entry lookup table used to binarize a grayscale image.

    Args:
        threshold: Gray levels strictly below this value map to 0; levels
            greater than or equal to it map to 1.

    Returns:
        A list of 256 ints, each 0 or 1, indexed by gray level.
    """
    return [0 if level < threshold else 1 for level in range(256)]


def load_binarized_image(path, threshold=BINARIZE_THRESHOLD):
    """Open *path*, convert it to grayscale and binarize it.

    Args:
        path: Location of the image file to read.
        threshold: Cut-off forwarded to ``build_threshold_table``.

    Returns:
        The image produced by ``point(table, '1')`` on the grayscale image.
    """
    # Imported here so the pure helpers in this module stay importable on
    # machines without the imaging/OCR stack installed.
    from PIL import Image

    # The context manager closes the file handle; the grayscale conversion
    # is done while the source image is still open.
    with Image.open(path) as source:
        gray = source.convert('L')
    return gray.point(build_threshold_table(threshold), '1')


def recognize_alto_xml(image, lang=OCR_LANG):
    """Run tesseract on *image* and return the ``image_to_alto_xml`` result.

    Args:
        image: Image object accepted by ``pytesseract.image_to_alto_xml``.
        lang: Tesseract language specification.

    Returns:
        Whatever ``pytesseract.image_to_alto_xml`` returns (XML markup).
    """
    import pytesseract

    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
    return pytesseract.image_to_alto_xml(image, lang=lang)


def _is_blank_or_digits(content):
    """Return True when *content* is the empty string or only digits."""
    if content == '':
        return True
    # The value is not guaranteed to be a str (the XML-to-JSON conversion
    # appears to turn numeric text into numbers), so test its string form
    # instead of calling .isdigit() on it directly.
    return str(content).isdigit()


def filter_data(data):
    """Drop entries whose ``@CONTENT`` is empty or consists only of digits.

    Dicts and lists are walked recursively. A dict that has an ``@CONTENT``
    key with an empty or digits-only value is removed from its parent (list
    item dropped, dict key dropped). The input is not modified; a filtered
    copy is returned.

    Args:
        data: A dict, a list, or any other value.

    Returns:
        ``None`` when *data* itself is a dict that must be filtered out,
        a new filtered dict or list when *data* is a dict or list, and
        *data* unchanged for every other type.
    """
    if isinstance(data, list):
        kept = []
        for item in data:
            cleaned = filter_data(item)
            # Only drop items that were filtered out; a None that was
            # already present in the input is preserved.
            if cleaned is None and item is not None:
                continue
            kept.append(cleaned)
        return kept

    if not isinstance(data, dict):
        return data

    if '@CONTENT' in data and _is_blank_or_digits(data['@CONTENT']):
        return None

    result = {}
    for key, value in data.items():
        cleaned = filter_data(value)
        if cleaned is None and value is not None:
            continue
        result[key] = cleaned
    return result


def alto_xml_to_json(alto_xml):
    """Convert XML markup to a filtered, indented JSON string.

    Args:
        alto_xml: XML document accepted by ``ElementTree.fromstring``.

    Returns:
        JSON text (4-space indent, non-ASCII characters kept as-is) of the
        BadgerFish conversion after ``filter_data`` has been applied.
    """
    from xmljson import badgerfish as bf

    root = fromstring(alto_xml)
    filtered = filter_data(bf.data(root))
    return json.dumps(filtered, indent=4, ensure_ascii=False)


def main():
    """Run OCR on ``IMAGE_PATH`` and print the filtered result as JSON."""
    image = load_binarized_image(IMAGE_PATH)
    print(alto_xml_to_json(recognize_alto_xml(image)))


if __name__ == '__main__':
    main()