Update some of the documentation

zR 2024-06-08 13:26:43 +08:00
parent 97c8bf8f45
commit 20a9f26ec6
7 changed files with 35 additions and 109 deletions

View File

@@ -16,7 +16,7 @@ Read this in [English](README_en.md).
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (both tested)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -16,7 +16,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 have been tested
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -8,8 +8,8 @@ base_url = "http://127.0.0.1:8000/v1/"
 client = OpenAI(api_key="EMPTY", base_url=base_url)
 
 
-def function_chat():
-    messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
+def function_chat(use_stream=False):
+    messages = [{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}]
     tools = [
         {
             "type": "function",
@@ -47,17 +47,24 @@ def function_chat():
         model="glm-4",
         messages=messages,
         tools=tools,
-        stream=False,  # must use False
+        stream=use_stream,
+        max_tokens=256,
+        temperature=0.9,
+        presence_penalty=1.2,
+        top_p=0.1,
         tool_choice="auto",  # use "auto" to let the model choose the tool automatically
         # tool_choice={"type": "function", "function": {"name": "my_function"}},
     )
     if response:
-        print(response.choices[0].message)
+        if use_stream:
+            for chunk in response:
+                print(chunk)
+        else:
+            print(response)
     else:
         print("Error:", response.status_code)
 
 
 def simple_chat(use_stream=False):
     messages = [
         {
@@ -74,20 +81,20 @@ def simple_chat(use_stream=False):
         messages=messages,
         stream=use_stream,
         max_tokens=256,
-        temperature=0.1,
-        presence_penalty=1.1,
-        top_p=0.8)
+        temperature=0.4,
+        presence_penalty=1.2,
+        top_p=0.8,
+    )
     if response:
         if use_stream:
             for chunk in response:
-                print(chunk.choices[0].delta.content)
+                print(chunk)
         else:
-            content = response.choices[0].message.content
-            print(content)
+            print(response)
     else:
         print("Error:", response.status_code)
 
 
 if __name__ == "__main__":
-    # simple_chat(use_stream=False)
-    function_chat()
+    simple_chat(use_stream=True)
+    # function_chat(use_stream=False)  # Only False is supported
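Side note, not part of this commit: the updated simple_chat(use_stream=True) prints whole ChatCompletionChunk objects. If you want the assembled reply instead, the incremental text lives in choices[0].delta.content of each chunk. A minimal sketch against the same local endpoint, assuming the demo server from this repo is already running (the prompt is a placeholder):

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")

# Stream a completion and rebuild the full reply from the content deltas.
stream = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    stream=True,
    max_tokens=256,
    temperature=0.4,
    top_p=0.8,
)
reply = ""
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:  # role-only and final chunks carry no content
        reply += delta
        print(delta, end="", flush=True)
print()

The trailing '[DONE]' event is consumed by the OpenAI client itself, so the loop simply ends when the stream is finished.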

View File

@@ -286,7 +286,6 @@ def process_messages(messages, tools=None, tool_choice="none"):
             if m.role == 'system':
                 messages.insert(0, {"role": m.role, "content": m.content})
                 break
-
     return messages
@@ -334,19 +333,12 @@ async def create_chat_completion(request: ChatCompletionRequest):
             except:
                 logger.warning("Failed to parse tool call")
-        # CallFunction
         if isinstance(function_call, dict):
             function_call = FunctionCallResponse(**function_call)
-            tool_response = ""
-            if not gen_params.get("messages"):
-                gen_params["messages"] = []
-            gen_params["messages"].append(ChatMessage(role="assistant", content=output))
-            gen_params["messages"].append(ChatMessage(role="tool", name=function_call.name, content=tool_response))
-            generate = predict(request.model, gen_params)
+            generate = parse_output_text(request.model, output, function_call=function_call)
             return EventSourceResponse(generate, media_type="text/event-stream")
         else:
-            generate = parse_output_text(request.model, output)
-            return EventSourceResponse(generate, media_type="text/event-stream")
+            return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
 
     response = ""
     async for response in generate_stream_glm4(gen_params):
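A note on the hunk above, not part of the diff: for a streaming request whose first output is a tool call, the server no longer appends a synthetic tool message and re-runs generation through predict; it now emits a single chunk from parse_output_text with the parsed call attached to the delta's function_call field. A hedged client-side sketch of picking that up with the OpenAI Python SDK (the tool schema and prompt are placeholders, and the updated request demo itself only exercises function_chat with use_stream=False):

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")

# Placeholder tool schema for illustration; use the real one from openai_api_request.py.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
}]

stream = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}],
    tools=tools,
    tool_choice="auto",
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.function_call is not None:  # attached by parse_output_text for tool calls
        print("tool call:", delta.function_call.name, delta.function_call.arguments)
    elif delta.content:
        print(delta.content, end="", flush=True)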
@@ -405,77 +397,6 @@ async def create_chat_completion(request: ChatCompletionRequest):
         usage=usage
     )
 
 
-async def predict(model_id: str, params: dict):
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(role="assistant"),
-        finish_reason=None
-    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    previous_text = ""
-    async for new_response in generate_stream_glm4(params):
-        decoded_unicode = new_response["text"]
-        delta_text = decoded_unicode[len(previous_text):]
-        previous_text = decoded_unicode
-        finish_reason = new_response["finish_reason"]
-        if len(delta_text) == 0 and finish_reason != "tool_calls":
-            continue
-        function_call = None
-        if finish_reason == "tool_calls":
-            try:
-                function_call = process_response(decoded_unicode, use_tool=True)
-            except:
-                logger.warning(
-                    "Failed to parse tool call, maybe the response is not a tool call or have been answered.")
-        if isinstance(function_call, dict):
-            function_call = FunctionCallResponse(**function_call)
-        delta = DeltaMessage(
-            content=None,
-            role="assistant",
-            function_call=None,
-            tool_calls=[{
-                "id": f"call_{int(time.time() * 1000)}",
-                "index": 0,
-                "type": "function",
-                "function": function_call
-            }] if isinstance(function_call, FunctionCallResponse) else None,
-        )
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=0,
-            delta=delta,
-            finish_reason=finish_reason
-        )
-        chunk = ChatCompletionResponse(
-            model=model_id,
-            id="",
-            choices=[choice_data],
-            object="chat.completion.chunk"
-        )
-        yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
-    )
-    chunk = ChatCompletionResponse(
-        model=model_id,
-        id="",
-        choices=[choice_data],
-        object="chat.completion.chunk"
-    )
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    yield '[DONE]'
 
 
 async def predict_stream(model_id, gen_params):
     output = ""
     is_function_call = False
@@ -537,24 +458,20 @@ async def predict_stream(model_id, gen_params):
     yield '[DONE]'
 
 
-async def parse_output_text(model_id: str, value: str):
+async def parse_output_text(model_id: str, value: str, function_call: FunctionCallResponse = None):
+    delta = DeltaMessage(role="assistant", content=value)
+    if function_call is not None:
+        delta.function_call = function_call
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
-        delta=DeltaMessage(role="assistant", content=value),
+        delta=delta,
         finish_reason=None
     )
     chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
-    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
     yield '[DONE]'
 
 
 if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
     engine_args = AsyncEngineArgs(
@@ -563,7 +480,8 @@ if __name__ == "__main__":
         tensor_parallel_size=1,
         dtype="bfloat16",
         trust_remote_code=True,
-        gpu_memory_utilization=0.3,  # fraction of GPU memory to occupy; set it according to your card's memory size, e.g. if your card has 80G and you only want to use 24G, set 24/80 = 0.3
+        gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=False,
         engine_use_ray=False,
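The removed comment carried the sizing rule for this knob: gpu_memory_utilization is the fraction of the card's memory vLLM may claim, i.e. desired memory divided by total memory; this commit simply resets the default to 0.9. A tiny sketch of that arithmetic (the 24 GB budget is just the example from the removed comment, not a recommendation):

TOTAL_GB = 80      # e.g. one A100-SXM4-80GB, as listed in the test hardware above
BUDGET_GB = 24     # hypothetical memory budget from the removed comment
gpu_memory_utilization = BUDGET_GB / TOTAL_GB   # 0.3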

View File

@@ -1,3 +1,4 @@
+# Please install the requirements.txt in basic_demo first!
 # use vllm
 # vllm>=0.4.3

View File

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (if you use Python 3.12.3, you currently need to install nltk from the git source)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -12,7 +12,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (Currently, you need to install nltk from the git source code if you use Python 3.12.3)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8