# inula/packages/inula-code-generator/inula-code-generator-web/backend/api/component/model.py

import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
from .detectImg import generate_detect_img
from api.component.ultralytics import YOLOv10


# Shared PaddleOCR instance, built once at import time and reused by
# extractTextWithPositions on every call. It is configured with angle
# classification enabled, GPU disabled, and lang="ch".
ocr_model = PaddleOCR(use_angle_cls=True, use_gpu=False, lang="ch")


def extractTextWithPositions(image):
    """Run OCR on *image* and return the recognized text with bounding boxes.

    Args:
        image: Image handed straight to ``ocr_model.ocr`` (the caller in this
            file passes an RGB ``numpy`` array).

    Returns:
        A list with one dict per non-blank recognized line. Each dict has the
        keys ``"text"``, ``"confidence"`` and ``"bbox"``; ``bbox`` is
        ``[x1, y1, x2, y2]`` in integer pixels, built from the first and third
        points of the detected region (presumably the top-left and
        bottom-right corners -- verify against the PaddleOCR point order).
        An empty list is returned when no text is found.
    """
    result = ocr_model.ocr(image, cls=True)

    # When nothing is detected PaddleOCR may hand back an empty result or
    # ``[None]`` (version dependent); iterating that would raise TypeError,
    # so report "no text" explicitly instead.
    if not result or not result[0]:
        return []

    textComponents = []
    for line in result[0]:
        text_region = line[0]
        text, confidence = line[1][0], line[1][1]

        # Ignore non-string or whitespace-only recognitions.
        if not isinstance(text, str) or not text.strip():
            continue

        x1, y1 = map(int, text_region[0])
        x2, y2 = map(int, text_region[2])
        textComponents.append(
            {"text": text, "confidence": confidence, "bbox": [x1, y1, x2, y2]}
        )

    return textComponents


def _normalizedPosition(x1, y1, x2, y2, width, height):
    """Express a pixel box as fractions of the image width and height."""
    return {
        "x": x1 / width,
        "y": y1 / height,
        "width": (x2 - x1) / width,
        "height": (y2 - y1) / height,
    }


def componentDetect(
    imgURL,
    modelPath=os.path.join("api", "component", "model.pt"),
    confidenceThreshold=0.15,
):
    """Detect UI components and text in an image.

    Non-text components come from the YOLOv10 model; text entries come from
    ``extractTextWithPositions``. Model detections whose class name is
    ``"text"`` (case-insensitive) are dropped so that text is only reported
    once, via OCR.

    Args:
        imgURL: Path of the image file, as accepted by ``cv2.imread``.
        modelPath: Path of the YOLOv10 weights file. The default is relative,
            so it is resolved against the current working directory.
        confidenceThreshold: Value forwarded to the model as ``conf``.

    Returns:
        A ``(detectionResult, detectImg)`` tuple. ``detectionResult`` is a list
        of dicts with ``"id"`` (sequential, starting at 1),
        ``"componentType"`` and ``"position"`` (``x``/``y``/``width``/
        ``height`` as fractions of the image size). Model components also
        carry ``"color"`` (mean RGB of the box); text components carry
        ``"text"`` instead. ``detectImg`` is whatever ``generate_detect_img``
        returns for the original BGR image and ``detectionResult``.

    Raises:
        ValueError: If the image at ``imgURL`` cannot be read.
    """
    # cv2.imread signals failure by returning None rather than raising; check
    # before the (expensive) model load so bad paths fail fast and clearly.
    img = cv2.imread(imgURL)
    if img is None:
        raise ValueError(f"Unable to read image: {imgURL!r}")

    model = YOLOv10(modelPath)
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    height, width = img.shape[:2]

    textComponents = extractTextWithPositions(imgRGB)

    # NOTE(review): the detector is fed the RGB array; confirm the bundled
    # YOLOv10 wrapper expects RGB rather than OpenCV's BGR for ndarray input.
    results = model(imgRGB, conf=confidenceThreshold)

    detectionResult = []
    componentID = 1
    for detection in results[0].boxes.data:
        x1, y1, x2, y2 = (int(v) for v in detection[:4])
        componentType = model.names[int(detection[5])]

        # Text is reported from the OCR pass below, not from the detector.
        if componentType.lower() == "text":
            continue

        # Clamp to the image so a negative coordinate cannot wrap around in
        # the slice, and skip zero-area boxes: averaging an empty ROI yields
        # NaN, which cannot be converted to an int colour channel.
        x1, x2 = max(0, min(x1, width)), max(0, min(x2, width))
        y1, y2 = max(0, min(y1, height)), max(0, min(y2, height))
        if x2 <= x1 or y2 <= y1:
            continue

        componentROI = imgRGB[y1:y2, x1:x2]
        avg_color = np.average(np.average(componentROI, axis=0), axis=0)
        red, green, blue = (int(c) for c in avg_color)

        detectionResult.append(
            {
                "id": componentID,
                "componentType": componentType,
                "position": _normalizedPosition(x1, y1, x2, y2, width, height),
                "color": {"r": red, "g": green, "b": blue},
            }
        )
        componentID += 1

    # Text entries intentionally carry no "color" key.
    for textComp in textComponents:
        x1, y1, x2, y2 = textComp["bbox"]
        detectionResult.append(
            {
                "id": componentID,
                "componentType": "Text",
                "position": _normalizedPosition(x1, y1, x2, y2, width, height),
                "text": textComp["text"],
            }
        )
        componentID += 1

    detectImg = generate_detect_img(img, detectionResult)

    return detectionResult, detectImg