-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: code_model.py
More file actions
71 lines (60 loc) · 2.52 KB
/
code_model.py
File metadata and controls
71 lines (60 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
from threading import Thread
def get_device():
    """Select the best available torch device: CUDA, then Apple MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")
class CodingModel:
    """Thin wrapper around a Qwen3 causal LM for code generation.

    Loads the tokenizer and model once at construction time and exposes
    blocking (`forward`) and streaming (`forward_with_thread`) generation.
    """

    model_name = "Qwen/Qwen3-0.6B"

    # Token id used in the original code to locate the end of the model's
    # "thinking" block (presumably Qwen3's "</think>" token).
    # NOTE(review): taken verbatim from the original — confirm against the
    # Qwen3 tokenizer vocabulary.
    _THINK_END_TOKEN_ID = 151668

    def __init__(self):
        device = get_device()
        # BUG FIX: `device` is a torch.device, so comparing it to the string
        # "mps" never matched and fp16 was never selected on Apple silicon;
        # compare the `.type` attribute instead.
        dtype = torch.float16 if device.type == "mps" else torch.float32
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
        # Kept for backward compatibility with external users of this attribute;
        # forward_with_thread now builds a fresh streamer per call because a
        # TextIteratorStreamer cannot be reused once exhausted.
        self.streamer = TextIteratorStreamer(
            self.tokenizer, skip_special_tokens=True, skip_prompt=True
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
            attn_implementation="eager",
            device_map=None,
        )
        # BUG FIX: the model was loaded but never moved off the CPU, so the
        # device selected above was effectively ignored.
        self.model.to(device)

    def prepare_model_input(self, messages):
        """Render chat `messages` with the model's chat template and tokenize.

        Returns a BatchEncoding (input_ids / attention_mask) already placed
        on the model's device, truncated to 8192 tokens.
        """
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # thinking switch
        )
        model_inputs = self.tokenizer(
            [text], return_tensors="pt", truncation=True, max_length=8192
        ).to(self.model.device)
        return model_inputs

    def forward(self, model_inputs):
        """Generate a completion and return it as a decoded string.

        Strips the prompt tokens and, if present, everything up to and
        including the final end-of-thinking token.
        """
        generated_ids = self.model.generate(**model_inputs, max_new_tokens=1024)
        # Drop the echoed prompt; keep only newly generated tokens.
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
        try:
            # Position just after the LAST end-of-thinking token.
            index = len(output_ids) - output_ids[::-1].index(self._THINK_END_TOKEN_ID)
        except ValueError:
            index = 0  # no thinking block emitted
        content = self.tokenizer.decode(
            output_ids[index:], skip_special_tokens=True
        ).strip("\n")
        return content

    # forward with real time text translation
    def forward_with_thread(self, model_inputs):
        """Generate with real-time token streaming; returns the full text.

        Runs `generate` on a worker thread and drains the streamer on the
        calling thread, printing each chunk as it arrives.
        """
        print("Forward pass...")
        # BUG FIX: build a fresh streamer per call — the shared instance from
        # __init__ is exhausted after one generation and would not yield
        # anything on a second call.
        streamer = TextIteratorStreamer(
            self.tokenizer, skip_special_tokens=True, skip_prompt=True
        )
        # BUG FIX: some tokenizers (Qwen included) may define no pad token;
        # fall back to EOS instead of passing pad_token_id=None.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = self.tokenizer.eos_token_id
        gen_kwargs = dict(
            **model_inputs,
            max_new_tokens=1024,
            do_sample=False,
            use_cache=True,
            pad_token_id=pad_id,
            streamer=streamer,
        )
        thread = Thread(target=self.model.generate, kwargs=gen_kwargs)
        thread.start()
        chunks = []
        for text_chunk in streamer:
            chunks.append(text_chunk)
            print(text_chunk, end="", flush=True)
        thread.join()
        # Join once instead of quadratic `+=` accumulation.
        return "".join(chunks)