Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 39 additions & 3 deletions ai_worker/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from httpx_sse import aconnect_sse
from llama_cpp.server.app import Settings as LlamaSettings, create_app as create_llama_app
import llama_cpp.server.app
import whisper
from pydantic import BaseModel, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
from pynvml.smi import nvidia_smi
Expand Down Expand Up @@ -153,9 +154,11 @@ def __init__(self, conf: Config):
self.slug = ""
self.stopped = False
self.llama = None
self.whisper = None
self.llama_model = None
self.whisper_model = None
self.llama_cli: Optional[AsyncClient] = None

if FineTuner:
self.fine_tuner = FineTuner(self.conf)
else:
Expand Down Expand Up @@ -270,6 +273,18 @@ def clear_llama_model(self):
self.llama_cli = None
self.llama_model = None

def clear_whisper_model(self):
    """Drop any loaded whisper model so the next request loads fresh."""
    self.whisper_model = None
    self.whisper = None

async def load_whisper_model(self, name):
    """Ensure the whisper model `name` is loaded into self.whisper.

    No-op if `name` is already the active model.  Mirrors the transcribe
    call in run_one: whisper.load_model is blocking (may download weights
    and load them into memory), so it is pushed to the default executor
    instead of stalling the event loop.
    """
    assert name, "No model name"
    if name == self.whisper_model:
        # Already loaded; nothing to do.
        return
    log.debug("loading model: %s", name)

    loop = asyncio.get_running_loop()
    self.whisper = await loop.run_in_executor(None, whisper.load_model, name)
    self.whisper_model = name

async def load_model(self, name):
assert name, "No model name"
Expand All @@ -295,13 +310,15 @@ async def load_model(self, name):
assert self.llama, "Load llama failed. Try lowering layers."
self.llama_cli = AsyncClient(app=self.llama, base_url="http://test")
self.llama_model = name


def _get_connect_info(self) -> ConnectMessage:
disk_space = get_free_space_mb(".")

caps = []

caps += ['llama-infer']
caps += ["whisper"]

if self.fine_tuner:
caps += ["llama-fine-tune"]
Expand Down Expand Up @@ -412,14 +429,26 @@ async def run_ws(self):
await asyncio.sleep(1)
self.stopped = True

async def download_tmp_file(self, url: str) -> str:
    """Stream a file from the queen's /storage endpoint into a temp file.

    `url` is the stored filename to request.  Returns the path of the
    temporary file; the caller is responsible for deleting it.
    Raises httpx.HTTPStatusError if the queen responds with an error.
    """
    import urllib.parse
    res = urllib.parse.urlparse(self.conf.queen_url)
    # queen_url is a websocket URL; map wss->https (and ws->http) for REST.
    scheme = "https" if res.scheme == "wss" else "http"
    # urlencode so filenames containing '&', '#', spaces, etc. survive
    # the query string instead of being truncated or misparsed.
    query = urllib.parse.urlencode({"filename": url})
    async with AsyncClient() as cli:
        async with cli.stream("GET", f"{scheme}://{res.netloc}/storage/?{query}") as response:
            # Fail early rather than writing an error page to the temp file.
            response.raise_for_status()
            with tempfile.NamedTemporaryFile("wb", delete=False) as download_file:
                async for chunk in response.aiter_bytes():
                    download_file.write(chunk)
    # The context manager closed the file; only the path escapes.
    return download_file.name


async def run_one(self):
event = None
req_str = None
try:
req_str = await self.ws_recv()
req = Req.model_validate_json(req_str)
model = req.openai_req.get("model")

log.debug("loading %s", model)

st = time.monotonic()
Expand All @@ -435,6 +464,13 @@ async def run_one(self):
elif req.openai_url == "/v1/embeddings" and model.startswith(MODEL_PREFIX):
res = self.fast_embed.embed(req.openai_req)
await self.ws_send(json.dumps(res), True)
elif req.openai_url == "/v1/audio/transcriptions":
await self.load_whisper_model(model)
filename = await self.download_tmp_file(req.openai_req["file"])
# whisper.transcribe is not async; run it in an executor so it doesn't block the event loop
loop = asyncio.get_running_loop()
result = await loop.run_in_executor(None, lambda: self.whisper.transcribe(filename))
await self.ws_send(json.dumps({"text": result["text"]}), True)
elif req.openai_req.get("stream"):
await self.load_model(model)
async with aconnect_sse(self.llama_cli, "POST", req.openai_url, json=req.openai_req) as sse:
Expand Down Expand Up @@ -469,7 +505,7 @@ async def handle_image_generation(self, request_data):

async def get_model(self, name):
    """Return a local path for model `name`, downloading it if needed."""
    path = await self.download_model(name)
    return path

async def download_file(self, url: str) -> str:
output_file = url_to_tempfile(self.conf, url)
if not os.path.exists(output_file):
Expand Down
4 changes: 3 additions & 1 deletion build-bin.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
gpu="$1"
arch="$2"
cmake="$3"
opts="--onefile"
opts="--onedir"

if [ -z "$cmake" -o -z "$gpu" ]; then
echo usage build-bin.sh gpu arch "cmake-args"
Expand Down Expand Up @@ -36,6 +36,8 @@ rm -f ~/.cache/pypoetry/artifacts/*/*/*/*/llama*

CMAKE_ARGS="$cmake" FORCE_CMAKE=1 poetry install $with_torch

poetry run pip install openai-whisper

if [ "$gpu" == "cuda-torch" ]; then
# annoying hack because fastembed is probably a bad idea
pip install transformers==4.35.0
Expand Down
26 changes: 26 additions & 0 deletions hooks/hook-whisper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from PyInstaller.utils.hooks import collect_data_files, get_package_paths
import os

# PyInstaller hook for openai-whisper: bundle the cuDNN shared libraries
# (shipped inside the `nvidia` pip package) and whisper's own runtime
# assets (mel filter banks / tokenizer data).

# get_package_paths returns (base_dir, package_dir); [0] is the
# site-packages root.  NOTE(review): this assumes `whisper` is installed
# under the same site-packages root as `nvidia` — confirm for the build env.
package_path = get_package_paths('nvidia')[0]

# Start from the nvidia package's declared data files.
datas = collect_data_files('nvidia')

_cudnn_lib_dir = os.path.join(package_path, 'nvidia', 'cudnn', 'lib')

if os.name == 'nt':  # Windows
    # NOTE(review): untested (original author had no Windows GPU to check);
    # Windows cuDNN wheels usually name these cudnn_ops_infer64_8.dll /
    # cudnn_cnn_infer64_8.dll rather than libcudnn_*.dll — verify before
    # relying on a Windows build.
    _libs = ('libcudnn_ops_infer.dll', 'libcudnn_cnn_infer.dll')
elif os.name == 'posix':  # Linux/Mac
    _libs = ('libcudnn_ops_infer.so.8', 'libcudnn_cnn_infer.so.8')
else:
    _libs = ()

for _lib in _libs:
    datas.append((os.path.join(_cudnn_lib_dir, _lib), '.'))

# whisper loads its assets relative to its package directory at runtime,
# so they must be bundled under ./whisper/assets in the frozen app.
assets = os.path.join(package_path, "whisper", 'assets')
datas.append((assets, './whisper/assets'))

Loading