同步
This commit is contained in:
		
							
								
								
									
										6
									
								
								main.py
									
									
									
									
									
								
							
							
						
						
									
										6
									
								
								main.py
									
									
									
									
									
								
							@@ -55,9 +55,9 @@ def process_images(conn, ocr, es):
 | 
				
			|||||||
            item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
 | 
					            item = [x for x in ocr.ocr(image) if x['text'] and not x['text'].isdigit() and len(x['text']) > 1]
 | 
				
			||||||
            text = ' '.join([x['text'] for x in item])
 | 
					            text = ' '.join([x['text'] for x in item])
 | 
				
			||||||
            print(id, text)
 | 
					            print(id, text)
 | 
				
			||||||
            save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
 | 
					            #save_text(conn, id, json.dumps(item, ensure_ascii=False, cls=MyEncoder))
 | 
				
			||||||
            es.index(index='web_images', id=id, body={'content': text})
 | 
					            #es.index(index='web_images', id=id, body={'content': text})
 | 
				
			||||||
        conn.commit()
 | 
					        #conn.commit()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def main():
 | 
					def main():
 | 
				
			||||||
    es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
 | 
					    es = Elasticsearch(config['ELASTICSEARCH_HOST'], basic_auth=(config['ELASTICSEARCH_USERNAME'], config['ELASTICSEARCH_PASSWORD']), verify_certs=False)
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										12
									
								
								pp.py
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								pp.py
									
									
									
									
									
								
							@@ -7,11 +7,14 @@ import pymysql
 | 
				
			|||||||
import json
 | 
					import json
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
 | 
					import logging
 | 
				
			||||||
from PIL import Image, ImageFile
 | 
					from PIL import Image, ImageFile
 | 
				
			||||||
from dotenv import dotenv_values
 | 
					from dotenv import dotenv_values
 | 
				
			||||||
from elasticsearch import Elasticsearch
 | 
					from elasticsearch import Elasticsearch
 | 
				
			||||||
from paddleocr import PaddleOCR
 | 
					from paddleocr import PaddleOCR
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					logging.disable(logging.DEBUG)    # 关闭DEBUG日志的打印
 | 
				
			||||||
 | 
					logging.disable(logging.WARNING)  # 关闭WARNING日志的打印
 | 
				
			||||||
warnings.filterwarnings("ignore")
 | 
					warnings.filterwarnings("ignore")
 | 
				
			||||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
 | 
					ImageFile.LOAD_TRUNCATED_IMAGES = True
 | 
				
			||||||
config = dotenv_values(".env")
 | 
					config = dotenv_values(".env")
 | 
				
			||||||
@@ -34,8 +37,8 @@ def download_image(url:str) -> Image.Image:
 | 
				
			|||||||
        else:
 | 
					        else:
 | 
				
			||||||
            response = requests.get(url)
 | 
					            response = requests.get(url)
 | 
				
			||||||
            return Image.open(io.BytesIO(response.content))
 | 
					            return Image.open(io.BytesIO(response.content))
 | 
				
			||||||
    except Exception:
 | 
					    except Exception as e:
 | 
				
			||||||
        print(f'图片从{url}下载失败')
 | 
					        print(f'图片从{url}下载失败,错误信息为:{e}')
 | 
				
			||||||
        return None
 | 
					        return None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def connect_to_mysql():
 | 
					def connect_to_mysql():
 | 
				
			||||||
@@ -51,11 +54,14 @@ KR = PaddleOCR(use_angle_cls=True, lang="korean")
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
def process_images(conn, es):
 | 
					def process_images(conn, es):
 | 
				
			||||||
    with conn.cursor(pymysql.cursors.SSCursor) as cursor:
 | 
					    with conn.cursor(pymysql.cursors.SSCursor) as cursor:
 | 
				
			||||||
        cursor.execute("SELECT id, content FROM web_images WHERE text='' LIMIT 10")
 | 
					        cursor.execute("SELECT id, content FROM web_images WHERE text!='' LIMIT 10")
 | 
				
			||||||
        for id, content in cursor.fetchall():
 | 
					        for id, content in cursor.fetchall():
 | 
				
			||||||
            image = download_image(content)
 | 
					            image = download_image(content)
 | 
				
			||||||
            if image is None:
 | 
					            if image is None:
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
 | 
					            if isinstance(image, Image.Image):
 | 
				
			||||||
 | 
					                image = np.array(image)
 | 
				
			||||||
 | 
					            print('---------------------', id, content)
 | 
				
			||||||
            print(CH.ocr(image, cls=True))
 | 
					            print(CH.ocr(image, cls=True))
 | 
				
			||||||
            print(JP.ocr(image, cls=True))
 | 
					            print(JP.ocr(image, cls=True))
 | 
				
			||||||
            print(KR.ocr(image, cls=True))
 | 
					            print(KR.ocr(image, cls=True))
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user