Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions ai_worker/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from httpx_sse import aconnect_sse
from llama_cpp.server.app import Settings as LlamaSettings, create_app as create_llama_app
import llama_cpp.server.app
import whisper
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from pynvml.smi import nvidia_smi
Expand Down Expand Up @@ -153,9 +154,11 @@ def __init__(self, conf: Config):
self.slug = ""
self.stopped = False
self.llama = None
self.whisper = None
self.llama_model = None
self.whisper_model = None
self.llama_cli: Optional[AsyncClient] = None

if FineTuner:
self.fine_tuner = FineTuner(self.conf)
else:
Expand Down Expand Up @@ -270,6 +273,18 @@ def clear_llama_model(self):
self.llama_cli = None
self.llama_model = None

def clear_whisper_model(self):
    """Drop any loaded whisper model so the next request loads fresh."""
    self.whisper_model = None
    self.whisper = None

async def load_whisper_model(self, name):
    """Ensure the whisper model `name` is loaded into self.whisper.

    No-op if `name` is already the active model.  Mirrors the transcribe
    call in run_one: whisper.load_model is blocking (may download weights
    and load them into memory), so it is pushed to the default executor
    instead of stalling the event loop.
    """
    assert name, "No model name"
    if name == self.whisper_model:
        # Already loaded; nothing to do.
        return
    log.debug("loading model: %s", name)

    loop = asyncio.get_running_loop()
    self.whisper = await loop.run_in_executor(None, whisper.load_model, name)
    self.whisper_model = name

async def load_model(self, name):
assert name, "No model name"
Expand All @@ -295,13 +310,15 @@ async def load_model(self, name):
assert self.llama, "Load llama failed. Try lowering layers."
self.llama_cli = AsyncClient(app=self.llama, base_url="http://test")
self.llama_model = name


def _get_connect_info(self) -> ConnectMessage:
disk_space = get_free_space_mb(".")

caps = []

caps += ['llama-infer']
caps += ["whisper"]

if self.fine_tuner:
caps += ["llama-fine-tune"]
Expand Down Expand Up @@ -412,14 +429,26 @@ async def run_ws(self):
await asyncio.sleep(1)
self.stopped = True

async def download_tmp_file(self, url: str) -> str:
    """Stream a file from the queen's /storage endpoint into a temp file.

    `url` is the stored filename to request.  Returns the path of the
    temporary file; the caller is responsible for deleting it.
    Raises httpx.HTTPStatusError if the queen responds with an error.
    """
    import urllib.parse
    res = urllib.parse.urlparse(self.conf.queen_url)
    # queen_url is a websocket URL; map wss->https (and ws->http) for REST.
    scheme = "https" if res.scheme == "wss" else "http"
    # urlencode so filenames containing '&', '#', spaces, etc. survive
    # the query string instead of being truncated or misparsed.
    query = urllib.parse.urlencode({"filename": url})
    async with AsyncClient() as cli:
        async with cli.stream("GET", f"{scheme}://{res.netloc}/storage/?{query}") as response:
            # Fail early rather than writing an error page to the temp file.
            response.raise_for_status()
            with tempfile.NamedTemporaryFile("wb", delete=False) as download_file:
                async for chunk in response.aiter_bytes():
                    download_file.write(chunk)
    # The context manager closed the file; only the path escapes.
    return download_file.name


async def run_one(self):
event = None
req_str = None
try:
req_str = await self.ws_recv()
req = Req.model_validate_json(req_str)
model = req.openai_req.get("model")

log.debug("loading %s", model)

st = time.monotonic()
Expand All @@ -435,6 +464,13 @@ async def run_one(self):
elif req.openai_url == "/v1/embeddings" and model.startswith(MODEL_PREFIX):
res = self.fast_embed.embed(req.openai_req)
await self.ws_send(json.dumps(res), True)
elif req.openai_url == "/v1/audio/transcriptions":
await self.load_whisper_model(model)
filename = await self.download_tmp_file(req.openai_req["file"])
# whisper.transcribe is not async; run it in an executor so it doesn't block the event loop
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(None, lambda: self.whisper.transcribe(filename))
await self.ws_send(json.dumps({"text": result["text"]}), True)
elif req.openai_req.get("stream"):
await self.load_model(model)
async with aconnect_sse(self.llama_cli, "POST", req.openai_url, json=req.openai_req) as sse:
Expand Down Expand Up @@ -469,7 +505,7 @@ async def handle_image_generation(self, request_data):

async def get_model(self, name):
    """Return a local path for model `name`, downloading it if needed."""
    path = await self.download_model(name)
    return path

async def download_file(self, url: str) -> str:
output_file = url_to_tempfile(self.conf, url)
if not os.path.exists(output_file):
Expand Down
4 changes: 3 additions & 1 deletion build-bin.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
gpu="$1"
arch="$2"
cmake="$3"
opts="--onefile"
opts="--onedir"

if [ -z "$cmake" -o -z "$gpu" ]; then
echo usage build-bin.sh gpu arch "cmake-args"
Expand Down Expand Up @@ -36,6 +36,8 @@ rm -f ~/.cache/pypoetry/artifacts/*/*/*/*/llama*

CMAKE_ARGS="$cmake" FORCE_CMAKE=1 poetry install $with_torch

poetry run pip install openai-whisper

if [ "$gpu" == "cuda-torch" ]; then
# annoying hack because fastembed is probably a bad idea
pip install transformers==4.35.0
Expand Down
26 changes: 26 additions & 0 deletions hooks/hook-whisper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from PyInstaller.utils.hooks import collect_data_files, get_package_paths
import os

# PyInstaller hook for openai-whisper: bundle the cuDNN shared libraries
# (shipped inside the `nvidia` pip package) and whisper's own runtime
# assets (mel filter banks / tokenizer data).

# get_package_paths returns (base_dir, package_dir); [0] is the
# site-packages root.  NOTE(review): this assumes `whisper` is installed
# under the same site-packages root as `nvidia` — confirm for the build env.
package_path = get_package_paths('nvidia')[0]

# Start from the nvidia package's declared data files.
datas = collect_data_files('nvidia')

_cudnn_lib_dir = os.path.join(package_path, 'nvidia', 'cudnn', 'lib')

if os.name == 'nt':  # Windows
    # NOTE(review): untested (original author had no Windows GPU to check);
    # Windows cuDNN wheels usually name these cudnn_ops_infer64_8.dll /
    # cudnn_cnn_infer64_8.dll rather than libcudnn_*.dll — verify before
    # relying on a Windows build.
    _libs = ('libcudnn_ops_infer.dll', 'libcudnn_cnn_infer.dll')
elif os.name == 'posix':  # Linux/Mac
    _libs = ('libcudnn_ops_infer.so.8', 'libcudnn_cnn_infer.so.8')
else:
    _libs = ()

for _lib in _libs:
    datas.append((os.path.join(_cudnn_lib_dir, _lib), '.'))

# whisper loads its assets relative to its package directory at runtime,
# so they must be bundled under ./whisper/assets in the frozen app.
assets = os.path.join(package_path, "whisper", 'assets')
datas.append((assets, './whisper/assets'))

Loading