inula/packages/inula-code-generator/inula-code-generator-web/backend/api/component/model.py

106 lines
2.9 KiB
Python

import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
from .detectImg import generate_detect_img
from api.component.ultralytics import YOLOv10
# Module-level PaddleOCR instance, built once at import time and reused by
# extractTextWithPositions. Configured with the angle classifier enabled,
# GPU use disabled, and the "ch" language setting.
ocr_model = PaddleOCR(use_angle_cls=True, use_gpu=False, lang="ch")
def extractTextWithPositions(image):
    """Run OCR on an image and return each non-blank text line with its box.

    Args:
        image: Image handed unchanged to ``ocr_model.ocr``.

    Returns:
        A list of dicts, one per recognised line whose text is a non-blank
        string. Each dict has the keys ``"text"``, ``"confidence"`` and
        ``"bbox"``; ``"bbox"`` is ``[x1, y1, x2, y2]`` taken from the first
        and third points of the detected region (presumably the top-left and
        bottom-right corners), converted with ``int``. The list is empty
        when OCR finds no text.
    """
    result = ocr_model.ocr(image, cls=True)

    # PaddleOCR can report "nothing found" as None / [None] instead of an
    # empty list; iterating that would raise TypeError, so bail out early.
    if not result or not result[0]:
        return []

    textComponents = []
    for line in result[0]:
        text_region = line[0]
        text, confidence = line[1][0], line[1][1]
        if not (isinstance(text, str) and text.strip()):
            continue
        x1, y1 = map(int, text_region[0])
        x2, y2 = map(int, text_region[2])
        textComponents.append(
            {"text": text, "confidence": confidence, "bbox": [x1, y1, x2, y2]}
        )
    return textComponents
def _normalizedPosition(x1, y1, x2, y2, width, height):
    """Express a pixel box as x/y/width/height fractions of the image size."""
    return {
        "x": x1 / width,
        "y": y1 / height,
        "width": (x2 - x1) / width,
        "height": (y2 - y1) / height,
    }


def componentDetect(
    imgURL,
    modelPath=os.path.join("api", "component", "model.pt"),
    confidenceThreshold=0.15,
):
    """Detect UI components and text in an image file.

    Non-text components come from the YOLOv10 model; text comes from
    ``extractTextWithPositions``. Model detections whose class name is
    ``"text"`` (case-insensitive) are dropped so text is not reported twice.

    Args:
        imgURL: Path handed to ``cv2.imread``.
        modelPath: Weights file handed to ``YOLOv10``.
        confidenceThreshold: Passed to the model call as ``conf``.

    Returns:
        A tuple ``(detectionResult, detectImg)``. ``detectionResult`` is a
        list of dicts with sequential ``"id"`` values starting at 1, a
        ``"componentType"`` and a ``"position"`` expressed as fractions of
        the image width/height. Model components also carry the mean
        ``"color"`` of their region as ``{"r", "g", "b"}`` ints; text
        entries carry ``"text"`` instead and have no colour. ``detectImg``
        is whatever ``generate_detect_img(img, detectionResult)`` returns.

    Raises:
        ValueError: If ``cv2.imread`` cannot load ``imgURL``.
    """
    img = cv2.imread(imgURL)
    if img is None:
        # imread signals failure by returning None; fail with a clear message
        # before paying for the model load.
        raise ValueError(f"Unable to read image: {imgURL!r}")
    height, width, _ = img.shape
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    model = YOLOv10(modelPath)
    textComponents = extractTextWithPositions(imgRGB)
    results = model(imgRGB, conf=confidenceThreshold)

    detectionResult = []
    for detection in results[0].boxes.data:
        # Row layout used here: x1, y1, x2, y2, confidence, class index.
        x1, y1, x2, y2 = (int(v) for v in detection[:4])
        componentType = model.names[int(detection[5])]
        if componentType.lower() == "text":
            continue

        componentROI = imgRGB[y1:y2, x1:x2]
        if componentROI.size == 0:
            # A box that collapses to zero area after int truncation has no
            # pixels: its mean colour would be NaN and int(NaN) raises.
            continue
        avg_color_per_row = np.average(componentROI, axis=0)
        red, green, blue = (int(c) for c in np.average(avg_color_per_row, axis=0))

        detectionResult.append(
            {
                "id": len(detectionResult) + 1,
                "componentType": componentType,
                "position": _normalizedPosition(x1, y1, x2, y2, width, height),
                "color": {"r": red, "g": green, "b": blue},
            }
        )

    for textComp in textComponents:
        x1, y1, x2, y2 = textComp["bbox"]
        detectionResult.append(
            {
                "id": len(detectionResult) + 1,
                "componentType": "Text",
                "position": _normalizedPosition(x1, y1, x2, y2, width, height),
                "text": textComp["text"],
            }
        )

    detectImg = generate_detect_img(img, detectionResult)
    return detectionResult, detectImg