forked from sums001/Windows-Copilot-API
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapi.py
More file actions
131 lines (110 loc) · 4.6 KB
/
Copy pathapi.py
File metadata and controls
131 lines (110 loc) · 4.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""FastAPI app wiring Copilot onto the OpenAI Chat Completions API."""
import threading
import time
from fastapi import FastAPI
from fastapi.responses import JSONResponse, StreamingResponse
from copilot import CopilotClient
from .config import MODEL_NAME, RATE_LIMIT_BURST, RATE_LIMIT_RPM
from .openai_format import (
completion_response,
new_id,
sse_event,
stream_chunk,
)
from .prompt import messages_to_prompt
from .ratelimit import TokenBucket
from .schemas import ChatCompletionRequest
# ASGI application object; the route decorators further down register their
# handlers on it.
app = FastAPI(title="Copilot OpenAI-compatible API", version="1.0.0")
# One module-level Copilot client, shared by the streaming and non-streaming
# code paths in this module.
client = CopilotClient()
# Self-imposed rate limit on top of the concurrency lock below: this caps
# requests-per-minute, the lock caps requests-in-flight. See server/ratelimit.py.
_rate_limiter = TokenBucket(RATE_LIMIT_RPM, RATE_LIMIT_BURST)
def _rate_limited_response():
    """Spend a token; return an OpenAI-shaped 429 if none left, else ``None``.

    Returns:
        ``None`` when ``_rate_limiter.try_acquire()`` reports the request is
        allowed. Otherwise a ``JSONResponse`` with status 429, a
        ``Retry-After`` header in whole seconds (at least 1), and an
        ``error`` object in the body.
    """
    # Function-scope import so this block does not depend on the file header.
    import math

    allowed, wait = _rate_limiter.try_acquire()
    if allowed:
        return None
    # Round the wait *up* to whole seconds. ``round()`` can round down
    # (1.4 -> 1), which would tell the client to retry before a token is
    # available and earn it another 429.
    # NOTE(review): assumes ``wait`` is the number of seconds until the next
    # token -- confirm against TokenBucket.try_acquire.
    secs = max(1, math.ceil(wait))
    return JSONResponse(
        status_code=429,
        headers={"Retry-After": str(secs)},
        content={"error": {
            "message": (
                f"Rate limit exceeded (>{RATE_LIMIT_RPM:g} req/min). "
                f"Retry in {secs}s."
            ),
            "type": "rate_limit_error",
            "code": "rate_limit_exceeded",
        }},
    )
# Copilot's per-account chat socket doesn't tolerate concurrent conversations
# from one process (parallel requests error out or hang). This server bridges a
# single signed-in account, so we serialize upstream calls: concurrent HTTP
# requests queue here and run one at a time. Predictable, at the cost of
# parallelism — fine for a personal bridge.
# Acquired by both the streaming generator (`_stream`) and the non-streaming
# branch of `chat_completions`.
_upstream_lock = threading.Lock()
def _stream(prompt: str, model: str, conversation_id=None):
    """Generate the SSE body of a streamed chat completion for ``prompt``.

    Events are produced in this order: an opening chunk announcing the
    assistant role, one content chunk per non-empty string piece read from
    ``client.stream``, and a closing ``finish="stop"`` chunk that carries the
    upstream conversation id. If anything raises along the way, a single
    content chunk with ``finish="error"`` describing the exception is sent
    instead. The ``data: [DONE]`` terminator is always the last event.

    ``conversation_id`` is passed through to ``client.stream``: a value
    continues an existing Copilot thread, ``None`` starts a fresh one whose id
    is reported on the closing chunk.
    """
    chunk_id = new_id()
    started_at = int(time.time())

    def event(delta, **extra):
        # Every chunk of one response shares the same id, timestamp and model.
        return sse_event(stream_chunk(chunk_id, started_at, model, delta, **extra))

    try:
        # One upstream chat at a time. The ``with`` block releases the lock
        # when the generator finishes or is closed (e.g. client disconnect).
        with _upstream_lock:
            yield event({"role": "assistant"})
            upstream = client.stream(prompt, conversation_id=conversation_id)
            for piece in upstream:
                # Skip anything that is not a non-empty text fragment.
                if not isinstance(piece, str) or not piece:
                    continue
                yield event({"content": piece})
            # The upstream conversation id is only known once the stream has
            # run, so it rides on the closing chunk.
            yield event(
                {},
                finish="stop",
                conversation_id=upstream.conversation_id,
            )
    except Exception as exc:  # report the failure in-band rather than hanging
        yield event({"content": f"\n[error: {exc}]"}, finish="error")
    yield "data: [DONE]\n\n"
@app.get("/v1/models")
def list_models():
    """Return an OpenAI-style model list advertising the single ``MODEL_NAME``."""
    model_entry = {
        "id": MODEL_NAME,
        "object": "model",
        "created": 0,
        "owned_by": "microsoft",
    }
    return {"object": "list", "data": [model_entry]}
@app.post("/v1/chat/completions")
def chat_completions(req: ChatCompletionRequest):
    """Serve a chat completion request, streamed or as a single response.

    Steps, in order:
      1. Flatten ``req.messages`` into a prompt; a blank prompt is a 400.
      2. Resolve the model name, defaulting to ``MODEL_NAME``.
      3. Apply the rate limiter; an exhausted budget returns its 429.
      4. If ``req.stream`` is set, hand back an SSE ``StreamingResponse``
         driven by ``_stream``.
      5. Otherwise call ``client.chat`` under ``_upstream_lock``; an
         exception becomes a 502 ``upstream_error``.
    """
    text = messages_to_prompt(req.messages)
    if not text.strip():
        bad_request = {
            "error": {"message": "no text content in messages", "type": "invalid_request_error"}
        }
        return JSONResponse(status_code=400, content=bad_request)

    model = req.model or MODEL_NAME

    # The rate check runs before the upstream lock is touched, so excess
    # callers get a fast 429 instead of queueing behind serialized requests.
    rejection = _rate_limited_response()
    if rejection is not None:
        return rejection

    if req.stream:
        events = _stream(text, model, req.conversation_id)
        return StreamingResponse(events, media_type="text/event-stream")

    try:
        with _upstream_lock:  # serialize: one upstream chat at a time
            reply = client.chat(text, conversation_id=req.conversation_id)
    except Exception as exc:
        upstream_failure = {"error": {"message": str(exc), "type": "upstream_error"}}
        return JSONResponse(status_code=502, content=upstream_failure)
    return completion_response(reply.text, model, reply.conversation_id)
@app.get("/")
def root():
    """Return the service name and the list of API endpoints it exposes."""
    endpoints = ["/v1/models", "/v1/chat/completions"]
    return {"service": "Copilot OpenAI-compatible API", "endpoints": endpoints}