add openai demo stream and function call

fix #130 #124
zR 2024-06-09 16:11:20 +08:00
parent 7c94294acc
commit a9fe1aba02
5 changed files with 162 additions and 79 deletions

View File

@ -16,7 +16,7 @@ Read this in [English](README_en.md).
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.10.12 / 3.12.3 both tested
++ Python: 3.10.12 (recommended) / 3.12.3 both tested
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@ -16,7 +16,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.10.12 / 3.12.3 have been tested
++ Python: 3.10.12 (recommended) / 3.12.3 have been tested
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@ -9,7 +9,34 @@ client = OpenAI(api_key="EMPTY", base_url=base_url)
 def function_chat(use_stream=False):
-    messages = [{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}]
+    messages = [
+        {
+            "role": "user", "content": "What's the Celsius temperature in San Francisco?"
+        },
+
+        # Give Observations
+        # {
+        #     "role": "assistant",
+        #     "content": None,
+        #     "function_call": None,
+        #     "tool_calls": [
+        #         {
+        #             "id": "call_1717912616815",
+        #             "function": {
+        #                 "name": "get_current_weather",
+        #                 "arguments": "{\"location\": \"San Francisco, CA\", \"format\": \"celsius\"}"
+        #             },
+        #             "type": "function"
+        #         }
+        #     ]
+        # },
+
+        # {
+        #     "tool_call_id": "call_1717912616815",
+        #     "role": "tool",
+        #     "name": "get_current_weather",
+        #     "content": "23°C",
+        # }
+    ]
     tools = [
         {
             "type": "function",
@ -35,11 +62,11 @@ def function_chat(use_stream=False):
         },
     ]
-    # # All Tools capability: drawing
+    # All Tools: CogView
     # messages = [{"role": "user", "content": "帮我画一张天空的画画吧"}]
     # tools = [{"type": "cogview"}]
-    #
-    # All Tools capability: web search
+    # All Tools: Searching
     # messages = [{"role": "user", "content": "今天黄金的价格"}]
     # tools = [{"type": "simple_browser"}]
@ -52,8 +79,7 @@ def function_chat(use_stream=False):
         temperature=0.9,
         presence_penalty=1.2,
         top_p=0.1,
-        tool_choice="auto",  # use "auto" to let the model choose the tool automatically
-        # tool_choice={"type": "function", "function": {"name": "my_function"}},
+        tool_choice="auto"
     )
     if response:
         if use_stream:
@ -73,7 +99,7 @@ def simple_chat(use_stream=False):
         },
         {
             "role": "user",
-            "content": "好,你是谁"
+            "content": "是谁"
         }
     ]
     response = client.chat.completions.create(
@ -96,5 +122,6 @@ def simple_chat(use_stream=False):
 if __name__ == "__main__":
-    simple_chat(use_stream=True)
-    # function_chat(use_stream=False)  # Only False is supported
+    # simple_chat(use_stream=False)
+    function_chat(use_stream=False)
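
For reference, the commented-out "Give Observations" block above is the second turn of a tool-calling round trip: the assistant's `tool_calls` message followed by a `tool` message carrying the result. A minimal sketch of that round trip against this demo server is shown below; the `glm-4` model name, the local `base_url`, and the hard-coded "23°C" observation are illustrative assumptions, not values taken from this commit.

# Hypothetical tool-calling round trip for the demo above (sketch, not part of this commit).
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1")  # assumed server address

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given location.",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string"},
                "format": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location", "format"],
        },
    },
}]
messages = [{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}]

# First turn: the model answers with a tool call instead of text.
first = client.chat.completions.create(model="glm-4", messages=messages, tools=tools, tool_choice="auto")
tool_call = first.choices[0].message.tool_calls[0]

# Echo the assistant's tool call back, then attach the observation as a "tool" message,
# mirroring the commented-out block in the demo.
messages.append({
    "role": "assistant",
    "content": None,
    "function_call": None,
    "tool_calls": [{
        "id": tool_call.id,
        "function": {"name": tool_call.function.name, "arguments": tool_call.function.arguments},
        "type": "function",
    }],
})
messages.append({
    "tool_call_id": tool_call.id,
    "role": "tool",
    "name": tool_call.function.name,
    "content": "23°C",  # pretend result of the weather lookup
})

# Second turn: the model turns the observation into a natural-language answer.
second = client.chat.completions.create(model="glm-4", messages=messages, tools=tools)
print(second.choices[0].message.content)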

View File

@ -1,6 +1,6 @@
 import time
 from asyncio.log import logger
+import re
 import uvicorn
 import gc
 import json
@ -135,13 +135,15 @@ class InvalidScoreLogitsProcessor(LogitsProcessor):
 def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
     lines = output.strip().split("\n")
-    if len(lines) == 2:
+    arguments_json = None
+    special_tools = ["cogview", "simple_browser"]
+    tool_call_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
+
+    if len(lines) >= 2 and tool_call_pattern.match(lines[0]):
         function_name = lines[0].strip()
-        arguments = lines[1].strip()
-        special_tools = ["cogview", "simple_browser"]
-        arguments_json = None
+        arguments = "\n".join(lines[1:]).strip()
+
         try:
             arguments_json = json.loads(arguments)
             is_tool_call = True
@ -151,23 +153,28 @@ def process_response(output: str, use_tool: bool = False) -> Union[str, dict]:
         if is_tool_call and use_tool:
             content = {
                 "name": function_name,
-                "arguments": json.dumps(arguments_json if isinstance(arguments_json, dict) else arguments,
-                                        ensure_ascii=False)
+                "arguments": json.dumps(arguments_json if isinstance(arguments_json, dict) else arguments, ensure_ascii=False)
             }
-            if function_name in special_tools:
-                content["text"] = arguments
-            return content
-        elif is_tool_call:
-            content = {
-                "name": function_name,
-                "content": json.dumps(arguments_json if isinstance(arguments_json, dict) else arguments,
-                                      ensure_ascii=False)
-            }
-            return content
+            if function_name == "simple_browser":
+                search_pattern = re.compile(r'search\("(.+?)"\s*,\s*recency_days\s*=\s*(\d+)\)')
+                match = search_pattern.match(arguments)
+                if match:
+                    content["arguments"] = json.dumps({
+                        "query": match.group(1),
+                        "recency_days": int(match.group(2))
+                    }, ensure_ascii=False)
+            elif function_name == "cogview":
+                content["arguments"] = json.dumps({
+                    "prompt": arguments
+                }, ensure_ascii=False)
+            return content
     return output.strip()


 @torch.inference_mode()
 async def generate_stream_glm4(params):
     messages = params["messages"]
@ -177,6 +184,7 @@ async def generate_stream_glm4(params):
     repetition_penalty = float(params.get("repetition_penalty", 1.0))
     top_p = float(params.get("top_p", 1.0))
     max_new_tokens = int(params.get("max_tokens", 8192))
+
     messages = process_messages(messages, tools=tools, tool_choice=tool_choice)
     inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
     params_dict = {
@ -218,7 +226,7 @@ async def generate_stream_glm4(params):
 def process_messages(messages, tools=None, tool_choice="none"):
     _messages = messages
-    messages = []
+    processed_messages = []
     msg_has_sys = False

     def filter_tools(tool_choice, tools):
@ -235,7 +243,7 @@ def process_messages(messages, tools=None, tool_choice="none"):
     if isinstance(tool_choice, dict):
         tools = filter_tools(tool_choice, tools)
     if tools:
-        messages.append(
+        processed_messages.append(
             {
                 "role": "system",
                 "content": None,
@ -245,7 +253,7 @@ def process_messages(messages, tools=None, tool_choice="none"):
         msg_has_sys = True

     if isinstance(tool_choice, dict) and tools:
-        messages.append(
+        processed_messages.append(
             {
                 "role": "assistant",
                 "metadata": tool_choice["function"]["name"],
@ -255,38 +263,59 @@ def process_messages(messages, tools=None, tool_choice="none"):
     for m in _messages:
         role, content, func_call = m.role, m.content, m.function_call
+        tool_calls = getattr(m, 'tool_calls', None)
         if role == "function":
-            messages.append(
+            processed_messages.append(
                 {
                     "role": "observation",
                     "content": content
                 }
             )
-        elif role == "assistant" and func_call is not None:
-            for response in content.split("<|assistant|>"):
-                if "\n" in response:
-                    metadata, sub_content = response.split("\n", maxsplit=1)
-                else:
-                    metadata, sub_content = "", response
-                messages.append(
-                    {
-                        "role": role,
-                        "metadata": metadata,
-                        "content": sub_content.strip()
-                    }
-                )
+        elif role == "tool":
+            processed_messages.append(
+                {
+                    "role": "observation",
+                    "content": content,
+                    "function_call": True
+                }
+            )
+        elif role == "assistant":
+            if tool_calls:
+                for tool_call in tool_calls:
+                    processed_messages.append(
+                        {
+                            "role": "assistant",
+                            "metadata": tool_call.function.name,
+                            "content": tool_call.function.arguments
+                        }
+                    )
+            else:
+                for response in content.split("\n"):
+                    if "\n" in response:
+                        metadata, sub_content = response.split("\n", maxsplit=1)
+                    else:
+                        metadata, sub_content = "", response
+                    processed_messages.append(
+                        {
+                            "role": role,
+                            "metadata": metadata,
+                            "content": sub_content.strip()
+                        }
+                    )
         else:
             if role == "system" and msg_has_sys:
                 msg_has_sys = False
                 continue
-            messages.append({"role": role, "content": content})
+            processed_messages.append({"role": role, "content": content})

     if not tools or tool_choice == "none":
         for m in _messages:
             if m.role == 'system':
-                messages.insert(0, {"role": m.role, "content": m.content})
+                processed_messages.insert(0, {"role": m.role, "content": m.content})
                 break
-    return messages
+    return processed_messages


 @app.get("/health")
@ -306,6 +335,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if len(request.messages) < 1 or request.messages[-1].role == "assistant":
         raise HTTPException(status_code=400, detail="Invalid request")
+
     gen_params = dict(
         messages=request.messages,
         temperature=request.temperature,
@ -391,29 +421,76 @@ async def create_chat_completion(request: ChatCompletionRequest):
     return ChatCompletionResponse(
         model=request.model,
-        id="",  # for open_source model, id is empty
+        id="",
         choices=[choice_data],
         object="chat.completion",
         usage=usage
     )


 async def predict_stream(model_id, gen_params):
     output = ""
     is_function_call = False
     has_send_first_chunk = False
+    function_name = None
     async for new_response in generate_stream_glm4(gen_params):
         decoded_unicode = new_response["text"]
         delta_text = decoded_unicode[len(output):]
         output = decoded_unicode
         lines = output.strip().split("\n")
-        if not is_function_call and len(lines) >= 2:
-            is_function_call = True
-
-        if not is_function_call and len(output) > 7:
-            finish_reason = new_response["finish_reason"]
-            if not has_send_first_chunk:
-                message = DeltaMessage(
-                    content="",
-                    role="assistant",
-                    function_call=None,
-                )
+        if not is_function_call and len(lines) >= 2 and re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', lines[0]):
+            is_function_call = True
+            function_name = lines[0].strip()
+
+        if is_function_call:
+            for char in delta_text:
+                function_call = {"name": function_name, "arguments": char}
+                message = DeltaMessage(
+                    content=None,
+                    role="assistant",
+                    function_call=function_call
+                )
+                choice_data = ChatCompletionResponseStreamChoice(
+                    index=0,
+                    delta=message,
+                    finish_reason=None
+                )
+                chunk = ChatCompletionResponse(
+                    model=model_id,
+                    id="",
+                    choices=[choice_data],
+                    created=int(time.time()),
+                    object="chat.completion.chunk"
+                )
+                yield chunk.model_dump_json(exclude_unset=True)
+        else:
+            if len(output) > 7:
+                finish_reason = new_response.get("finish_reason", None)
+                if not has_send_first_chunk:
+                    message = DeltaMessage(
+                        content="",
+                        role="assistant",
+                        function_call=None,
+                    )
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=0,
+                        delta=message,
+                        finish_reason=finish_reason
+                    )
+                    chunk = ChatCompletionResponse(
+                        model=model_id,
+                        id="",
+                        choices=[choice_data],
+                        created=int(time.time()),
+                        object="chat.completion.chunk"
+                    )
+                    yield chunk.model_dump_json(exclude_unset=True)
+
+                send_msg = delta_text if has_send_first_chunk else output
+                has_send_first_chunk = True
+                message = DeltaMessage(
+                    content=send_msg,
+                    role="assistant",
+                    function_call=None,
+                )
@ -429,31 +506,10 @@ async def predict_stream(model_id, gen_params):
                     created=int(time.time()),
                     object="chat.completion.chunk"
                 )
-                yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-            send_msg = delta_text if has_send_first_chunk else output
-            has_send_first_chunk = True
-            message = DeltaMessage(
-                content=send_msg,
-                role="assistant",
-                function_call=None,
-            )
-            choice_data = ChatCompletionResponseStreamChoice(
-                index=0,
-                delta=message,
-                finish_reason=finish_reason
-            )
-            chunk = ChatCompletionResponse(
-                model=model_id,
-                id="",
-                choices=[choice_data],
-                created=int(time.time()),
-                object="chat.completion.chunk"
-            )
-            yield "{}".format(chunk.model_dump_json(exclude_unset=True))
+                yield chunk.model_dump_json(exclude_unset=True)

     if is_function_call:
-        yield output
+        yield json.dumps({"text": output})
     else:
         yield '[DONE]'
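
The process_response change above boils down to a small parsing rule: the first line of the model output must look like a bare identifier and names the tool, the remaining lines are the arguments, with simple_browser's search("...", recency_days=N) syntax and cogview's free-text prompt handled specially. A simplified standalone sketch of that rule (not the exact server function, which also honors the use_tool flag):

# Simplified sketch of the tool-call parsing rule used by process_response above.
import json
import re

TOOL_NAME = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
SEARCH = re.compile(r'search\("(.+?)"\s*,\s*recency_days\s*=\s*(\d+)\)')

def parse_tool_call(output: str):
    """Return (tool_name, arguments) for a tool call, or (None, text) for plain output."""
    lines = output.strip().split("\n")
    if len(lines) < 2 or not TOOL_NAME.match(lines[0]):
        return None, output.strip()
    name = lines[0].strip()
    raw = "\n".join(lines[1:]).strip()
    if name == "simple_browser":
        m = SEARCH.match(raw)
        if m:
            return name, {"query": m.group(1), "recency_days": int(m.group(2))}
    if name == "cogview":
        return name, {"prompt": raw}
    try:
        return name, json.loads(raw)   # ordinary function call: JSON arguments
    except json.JSONDecodeError:
        return name, raw               # keep the raw string if it is not valid JSON

# Example: a get_current_weather call exactly as the model would emit it.
print(parse_tool_call('get_current_weather\n{"location": "San Francisco, CA", "format": "celsius"}'))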

View File

@ -112,7 +112,7 @@ if __name__ == "__main__":
         }
         t = Thread(target=model.generate, kwargs=generate_kwargs)
         t.start()
-        print("GLM-4:", end="", flush=True)
+        print("GLM-4V:", end="", flush=True)
         for new_token in streamer:
             if new_token:
                 print(new_token, end="", flush=True)