chore(deps): update machine-learning (#6302)

* chore(deps): update machine-learning

* fix typing, use new lifespan syntax

* wrap in try / finally

* move log

---------

Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com>
Co-authored-by: mertalev <101130780+mertalev@users.noreply.github.com>
2025-08-11 19:29:00 +00:00 · 2024-01-13 05:00:09 +00:00
parent bd5ae9f31e
commit 20be42cec0
12 changed files with 238 additions and 212 deletions
--- a/machine-learning/app/conftest.py
+++ b/machine-learning/app/conftest.py
@@ -5,10 +5,10 @@ from unittest import mock
 import numpy as np
 import pytest
 from fastapi.testclient import TestClient
+from numpy.typing import NDArray
 from PIL import Image

 from .main import app
-from .schemas import ndarray_f32


@pytest.fixture
@@ -17,7 +17,7 @@ def pil_image() -> Image.Image:


@pytest.fixture
-def cv_image(pil_image: Image.Image) -> ndarray_f32:
+def cv_image(pil_image: Image.Image) -> NDArray[np.float32]:
    return np.asarray(pil_image)[:, :, ::-1]  # PIL uses RGB while cv2 uses BGR


--- a/machine-learning/app/main.py
+++ b/machine-learning/app/main.py
@@ -2,11 +2,11 @@ import asyncio
 import gc
 import os
 import signal
-import sys
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Iterator
+from contextlib import asynccontextmanager
+from typing import Any, AsyncGenerator, Iterator
 from zipfile import BadZipFile

 import orjson
@@ -26,7 +26,6 @@ from .schemas import (
 )

 MultiPartParser.max_file_size = 2**26  # spools to disk if payload is 64 MiB or larger
-app = FastAPI()

 model_cache = ModelCache(ttl=settings.model_ttl, revalidate=settings.model_ttl > 0)
 thread_pool: ThreadPoolExecutor | None = None
@@ -35,8 +34,8 @@ active_requests = 0
 last_called: float | None = None


-@app.on_event("startup")
-def startup() -> None:
+@asynccontextmanager
+async def lifespan(_: FastAPI) -> AsyncGenerator[None, None]:
    global thread_pool
    log.info(
        (
@@ -44,21 +43,22 @@ def startup() -> None:
            f"{f'after {settings.model_ttl}s of inactivity' if settings.model_ttl > 0 else 'disabled'}."
        )
    )
-    # asyncio is a huge bottleneck for performance, so we use a thread pool to run blocking code
-    thread_pool = ThreadPoolExecutor(settings.request_threads) if settings.request_threads > 0 else None
-    if settings.model_ttl > 0 and settings.model_ttl_poll_s > 0:
-        asyncio.ensure_future(idle_shutdown_task())
-    log.info(f"Initialized request thread pool with {settings.request_threads} threads.")
-
-
-@app.on_event("shutdown")
-def shutdown() -> None:
-    log.handlers.clear()
-    for model in model_cache.cache._cache.values():
-        del model
-    if thread_pool is not None:
-        thread_pool.shutdown()
-    gc.collect()
+    
+    try:
+        if settings.request_threads > 0:
+            # asyncio is a huge bottleneck for performance, so we use a thread pool to run blocking code
+            thread_pool = ThreadPoolExecutor(settings.request_threads) if settings.request_threads > 0 else None
+            log.info(f"Initialized request thread pool with {settings.request_threads} threads.")
+        if settings.model_ttl > 0 and settings.model_ttl_poll_s > 0:
+            asyncio.ensure_future(idle_shutdown_task())
+        yield
+    finally:
+        log.handlers.clear()
+        for model in model_cache.cache._cache.values():
+            del model
+        if thread_pool is not None:
+            thread_pool.shutdown()
+        gc.collect()


 def update_state() -> Iterator[None]:
@@ -71,6 +71,9 @@ def update_state() -> Iterator[None]:
        active_requests -= 1


+app = FastAPI(lifespan=lifespan)
+
+
@app.get("/", response_model=MessageResponse)
 async def root() -> dict[str, str]:
    return {"message": "Immich ML"}
--- a/machine-learning/app/models/ann.py
+++ b/machine-learning/app/models/ann.py
@@ -3,10 +3,10 @@ from __future__ import annotations
 from pathlib import Path
 from typing import Any, NamedTuple

-from numpy import ascontiguousarray
+import numpy as np
+from numpy.typing import NDArray

 from ann.ann import Ann
-from app.schemas import ndarray_f32, ndarray_i32

 from ..config import log, settings

@@ -56,10 +56,10 @@ class AnnSession:
    def run(
        self,
        output_names: list[str] | None,
-        input_feed: dict[str, ndarray_f32] | dict[str, ndarray_i32],
+        input_feed: dict[str, NDArray[np.float32]] | dict[str, NDArray[np.int32]],
        run_options: Any = None,
-    ) -> list[ndarray_f32]:
-        inputs: list[ndarray_f32] = [ascontiguousarray(v) for v in input_feed.values()]
+    ) -> list[NDArray[np.float32]]:
+        inputs: list[NDArray[np.float32]] = [np.ascontiguousarray(v) for v in input_feed.values()]
        return self.ann.execute(self.model, inputs)


--- a/machine-learning/app/models/clip.py
+++ b/machine-learning/app/models/clip.py
@@ -6,12 +6,13 @@ from pathlib import Path
 from typing import Any, Literal

 import numpy as np
+from numpy.typing import NDArray
 from PIL import Image
 from tokenizers import Encoding, Tokenizer

 from app.config import clean_name, log
 from app.models.transforms import crop, get_pil_resampling, normalize, resize, to_numpy
-from app.schemas import ModelType, ndarray_f32, ndarray_i32
+from app.schemas import ModelType

 from .base import InferenceModel

@@ -40,7 +41,7 @@ class BaseCLIPEncoder(InferenceModel):
            self.vision_model = self._make_session(self.visual_path)
            log.debug(f"Loaded clip vision model '{self.model_name}'")

-    def _predict(self, image_or_text: Image.Image | str) -> ndarray_f32:
+    def _predict(self, image_or_text: Image.Image | str) -> NDArray[np.float32]:
        if isinstance(image_or_text, bytes):
            image_or_text = Image.open(BytesIO(image_or_text))

@@ -48,7 +49,7 @@ class BaseCLIPEncoder(InferenceModel):
            case Image.Image():
                if self.mode == "text":
                    raise TypeError("Cannot encode image as text-only model")
-                outputs: ndarray_f32 = self.vision_model.run(None, self.transform(image_or_text))[0][0]
+                outputs: NDArray[np.float32] = self.vision_model.run(None, self.transform(image_or_text))[0][0]
            case str():
                if self.mode == "vision":
                    raise TypeError("Cannot encode text as vision-only model")
@@ -59,11 +60,11 @@ class BaseCLIPEncoder(InferenceModel):
        return outputs

    @abstractmethod
-    def tokenize(self, text: str) -> dict[str, ndarray_i32]:
+    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
        pass

    @abstractmethod
-    def transform(self, image: Image.Image) -> dict[str, ndarray_f32]:
+    def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
        pass

    @property
@@ -161,11 +162,11 @@ class OpenCLIPEncoder(BaseCLIPEncoder):
        self.tokenizer.enable_truncation(max_length=context_length)
        log.debug(f"Loaded tokenizer for CLIP model '{self.model_name}'")

-    def tokenize(self, text: str) -> dict[str, ndarray_i32]:
+    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
        tokens: Encoding = self.tokenizer.encode(text)
        return {"text": np.array([tokens.ids], dtype=np.int32)}

-    def transform(self, image: Image.Image) -> dict[str, ndarray_f32]:
+    def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]:
        image = resize(image, self.size)
        image = crop(image, self.size)
        image_np = to_numpy(image)
@@ -174,7 +175,7 @@ class OpenCLIPEncoder(BaseCLIPEncoder):


 class MCLIPEncoder(OpenCLIPEncoder):
-    def tokenize(self, text: str) -> dict[str, ndarray_i32]:
+    def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]:
        tokens: Encoding = self.tokenizer.encode(text)
        return {
            "input_ids": np.array([tokens.ids], dtype=np.int32),
--- a/machine-learning/app/models/facial_recognition.py
+++ b/machine-learning/app/models/facial_recognition.py
@@ -5,9 +5,10 @@ import cv2
 import numpy as np
 from insightface.model_zoo import ArcFaceONNX, RetinaFace
 from insightface.utils.face_align import norm_crop
+from numpy.typing import NDArray

 from app.config import clean_name
-from app.schemas import BoundingBox, Face, ModelType, ndarray_f32
+from app.schemas import Face, ModelType, is_ndarray

 from .base import InferenceModel

@@ -36,22 +37,25 @@ class FaceRecognizer(InferenceModel):
        )
        self.rec_model.prepare(ctx_id=0)

-    def _predict(self, image: ndarray_f32 | bytes) -> list[Face]:
+    def _predict(self, image: NDArray[np.uint8] | bytes) -> list[Face]:
        if isinstance(image, bytes):
-            image = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR)
-        bboxes, kpss = self.det_model.detect(image)
+            decoded_image = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR)
+        else:
+            decoded_image = image
+        assert is_ndarray(decoded_image, np.uint8)
+        bboxes, kpss = self.det_model.detect(decoded_image)
        if bboxes.size == 0:
            return []
-        assert isinstance(image, np.ndarray) and isinstance(kpss, np.ndarray)
+        assert is_ndarray(kpss, np.float32)

        scores = bboxes[:, 4].tolist()
        bboxes = bboxes[:, :4].round().tolist()

        results = []
-        height, width, _ = image.shape
+        height, width, _ = decoded_image.shape
        for (x1, y1, x2, y2), score, kps in zip(bboxes, scores, kpss):
-            cropped_img = norm_crop(image, kps)
-            embedding: ndarray_f32 = self.rec_model.get_feat(cropped_img)[0]
+            cropped_img = norm_crop(decoded_image, kps)
+            embedding: NDArray[np.float32] = self.rec_model.get_feat(cropped_img)[0]
            face: Face = {
                "imageWidth": width,
                "imageHeight": height,
--- a/machine-learning/app/models/transforms.py
+++ b/machine-learning/app/models/transforms.py
@@ -1,8 +1,7 @@
 import numpy as np
+from numpy.typing import NDArray
 from PIL import Image

-from app.schemas import ndarray_f32
-
 _PIL_RESAMPLING_METHODS = {resampling.name.lower(): resampling for resampling in Image.Resampling}


@@ -23,11 +22,13 @@ def crop(img: Image.Image, size: int) -> Image.Image:
    return img.crop((left, upper, right, lower))


-def to_numpy(img: Image.Image) -> ndarray_f32:
+def to_numpy(img: Image.Image) -> NDArray[np.float32]:
    return np.asarray(img.convert("RGB")).astype(np.float32) / 255.0


-def normalize(img: ndarray_f32, mean: float | ndarray_f32, std: float | ndarray_f32) -> ndarray_f32:
+def normalize(
+    img: NDArray[np.float32], mean: float | NDArray[np.float32], std: float | NDArray[np.float32]
+) -> NDArray[np.float32]:
    return (img - mean) / std


--- a/machine-learning/app/schemas.py
+++ b/machine-learning/app/schemas.py
@@ -1,13 +1,10 @@
 from enum import StrEnum
-from typing import Any, Protocol, TypeAlias, TypedDict, TypeGuard
+from typing import Any, Protocol, TypedDict, TypeGuard

 import numpy as np
+import numpy.typing as npt
 from pydantic import BaseModel

-ndarray_f32: TypeAlias = np.ndarray[int, np.dtype[np.float32]]
-ndarray_i64: TypeAlias = np.ndarray[int, np.dtype[np.int64]]
-ndarray_i32: TypeAlias = np.ndarray[int, np.dtype[np.int32]]
-

 class TextResponse(BaseModel):
    __root__: str
@@ -35,7 +32,7 @@ class HasProfiling(Protocol):

 class Face(TypedDict):
    boundingBox: BoundingBox
-    embedding: ndarray_f32
+    embedding: npt.NDArray[np.float32]
    imageWidth: int
    imageHeight: int
    score: float
@@ -43,3 +40,7 @@ class Face(TypedDict):

 def has_profiling(obj: Any) -> TypeGuard[HasProfiling]:
    return hasattr(obj, "profiling") and isinstance(obj.profiling, dict)
+
+
+def is_ndarray(obj: Any, dtype: "type[np._DTypeScalar_co]") -> "TypeGuard[npt.NDArray[np._DTypeScalar_co]]":
+    return isinstance(obj, np.ndarray) and obj.dtype == dtype