diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..90c9292 --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,4 @@ + +## 2024-05-20 - Non-blocking asyncio Inference Calls +**Learning:** Calling blocking synchronous code in an asyncio event loop halts all other concurrent tasks. In FastAPI applications, running ML inference or other heavy processing directly inside an `async def` endpoint executes it on the event-loop thread, which severely limits throughput and creates bottlenecks. +**Action:** Always wrap heavy synchronous functions (like `inference.generate_solution`) in `await asyncio.to_thread(...)` to dispatch them to background threads, preserving event loop concurrency for incoming requests. diff --git a/web/__pycache__/app.cpython-312.pyc b/web/__pycache__/app.cpython-312.pyc new file mode 100644 index 0000000..1077fd3 Binary files /dev/null and b/web/__pycache__/app.cpython-312.pyc differ diff --git a/web/app.py b/web/app.py index 62b21b8..21f0297 100644 --- a/web/app.py +++ b/web/app.py @@ -3,6 +3,8 @@ from fastapi.responses import FileResponse from pydantic import BaseModel from typing import Optional +import asyncio +import logging import inference @@ -40,14 +42,16 @@ async def solve(req: SolveRequest): raise HTTPException(status_code=400, detail="`problem` must be a non-empty string") try: - solution = inference.generate_solution( + solution = await asyncio.to_thread( + inference.generate_solution, problem=req.problem, cot=req.cot, temperature=req.temperature, top_p=req.top_p, max_new_tokens=req.max_new_tokens, ) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) + except Exception: + logging.exception("Error during inference") + raise HTTPException(status_code=500, detail="Internal server error") return SolveResponse(solution=solution)