dhanush342 · dhanush342 · Mar 18, 2026
diff --git a/.jules/bolt.md b/.jules/bolt.md
@@ -0,0 +1,4 @@
+
+## 2024-05-20 - Non-blocking asyncio Inference Calls
+**Learning:** Calling blocking synchronous code from a coroutine stalls the asyncio event loop: no other task scheduled on that loop can make progress until the call returns. In FastAPI applications, running ML inference or other heavy processing directly inside an `async def` endpoint therefore serializes requests, which severely limits throughput and creates a bottleneck.
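+**Action:** Wrap heavy synchronous functions (like `inference.generate_solution`) in `await asyncio.to_thread(...)` to dispatch them to a worker thread, preserving event loop concurrency for incoming requests. This helps when the function spends its time in blocking I/O or in native code that releases the GIL; pure-Python CPU-bound work still competes with the event loop thread for the GIL.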
diff --git a/web/__pycache__/app.cpython-312.pyc b/web/__pycache__/app.cpython-312.pyc
diff --git a/web/app.py b/web/app.py
@@ -3,6 +3,8 @@
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 from typing import Optional
+import asyncio
+import logging
 
 import inference
 
@@ -40,14 +42,16 @@ async def solve(req: SolveRequest):
         raise HTTPException(status_code=400, detail="`problem` must be a non-empty string")
 
     try:
-        solution = inference.generate_solution(
+        solution = await asyncio.to_thread(
+            inference.generate_solution,
             problem=req.problem,
             cot=req.cot,
             temperature=req.temperature,
             top_p=req.top_p,
             max_new_tokens=req.max_new_tokens,
         )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
+    except Exception:
+        logging.exception("Error during inference")
+        raise HTTPException(status_code=500, detail="Internal server error")
 
     return SolveResponse(solution=solution)