Update some of the documentation

parent 97c8bf8f45
commit 20a9f26ec6

@@ -16,7 +16,7 @@ Read this in [English](README_en.md).
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (both tested)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

@@ -16,7 +16,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 have been tested
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

@@ -8,8 +8,8 @@ base_url = "http://127.0.0.1:8000/v1/"
 client = OpenAI(api_key="EMPTY", base_url=base_url)


-def function_chat():
-    messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
+def function_chat(use_stream=False):
+    messages = [{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}]
     tools = [
         {
             "type": "function",

@@ -47,17 +47,24 @@ def function_chat():
         model="glm-4",
         messages=messages,
         tools=tools,
-        stream=False,  # must use False
+        stream=use_stream,
         max_tokens=256,
         temperature=0.9,
         presence_penalty=1.2,
         top_p=0.1,
+        tool_choice="auto",  # use "auto" to let the model choose the tool automatically
+        # tool_choice={"type": "function", "function": {"name": "my_function"}},
     )
     if response:
-        print(response.choices[0].message)
+        if use_stream:
+            for chunk in response:
+                print(chunk)
+        else:
+            print(response)
     else:
         print("Error:", response.status_code)


 def simple_chat(use_stream=False):
     messages = [
         {

@@ -74,20 +81,20 @@ def simple_chat(use_stream=False):
         messages=messages,
         stream=use_stream,
         max_tokens=256,
-        temperature=0.1,
-        presence_penalty=1.1,
-        top_p=0.8)
+        temperature=0.4,
+        presence_penalty=1.2,
+        top_p=0.8,
+    )
     if response:
         if use_stream:
             for chunk in response:
-                print(chunk.choices[0].delta.content)
+                print(chunk)
         else:
-            content = response.choices[0].message.content
-            print(content)
+            print(response)
     else:
         print("Error:", response.status_code)


 if __name__ == "__main__":
-    # simple_chat(use_stream=False)
-    function_chat()
+    simple_chat(use_stream=True)
+    # function_chat(use_stream=False)  # Only False is supported

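Not part of the diff above, just a minimal client sketch for exercising the updated demo script, assuming openai>=1.0 is installed and the demo server is listening at http://127.0.0.1:8000/v1/ as in openai_api_request.py:

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")

# Non-streaming request: the whole answer arrives in one response object.
response = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    stream=False,
    max_tokens=256,
)
print(response.choices[0].message.content)

# Streaming request, as simple_chat(use_stream=True) does above:
# each chunk carries a small delta of the reply.
stream = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    stream=True,
    max_tokens=256,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
print()
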
@@ -286,7 +286,6 @@ def process_messages(messages, tools=None, tool_choice="none"):
         if m.role == 'system':
             messages.insert(0, {"role": m.role, "content": m.content})
             break
-
     return messages

@@ -334,19 +333,12 @@ async def create_chat_completion(request: ChatCompletionRequest):
            except:
                logger.warning("Failed to parse tool call")

-        # CallFunction
         if isinstance(function_call, dict):
             function_call = FunctionCallResponse(**function_call)
-            tool_response = ""
-            if not gen_params.get("messages"):
-                gen_params["messages"] = []
-            gen_params["messages"].append(ChatMessage(role="assistant", content=output))
-            gen_params["messages"].append(ChatMessage(role="tool", name=function_call.name, content=tool_response))
-            generate = predict(request.model, gen_params)
+            generate = parse_output_text(request.model, output, function_call=function_call)
             return EventSourceResponse(generate, media_type="text/event-stream")
         else:
             generate = parse_output_text(request.model, output)
             return EventSourceResponse(generate, media_type="text/event-stream")
-        return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")

     response = ""
     async for response in generate_stream_glm4(gen_params):

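As a side note on the hunk above (not part of this commit): FunctionCallResponse(**function_call) turns the dict parsed from the model output into a typed object whose .name field is used by the server. A self-contained sketch with an assumed pydantic shape, since the real class is defined elsewhere in this repo:

from pydantic import BaseModel

class FunctionCallResponse(BaseModel):
    # Assumed shape for illustration; the repo's actual class may differ.
    name: str
    arguments: str

# A dict of the kind the tool-call parser might return (made-up values).
parsed = {"name": "get_current_weather", "arguments": '{"location": "San Francisco"}'}
function_call = FunctionCallResponse(**parsed)
print(function_call.name, function_call.arguments)
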
@@ -405,77 +397,6 @@ async def create_chat_completion(request: ChatCompletionRequest):
         usage=usage
     )


-async def predict(model_id: str, params: dict):
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(role="assistant"),
-        finish_reason=None
-    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    previous_text = ""
-    async for new_response in generate_stream_glm4(params):
-        decoded_unicode = new_response["text"]
-        delta_text = decoded_unicode[len(previous_text):]
-        previous_text = decoded_unicode
-
-        finish_reason = new_response["finish_reason"]
-        if len(delta_text) == 0 and finish_reason != "tool_calls":
-            continue
-
-        function_call = None
-        if finish_reason == "tool_calls":
-            try:
-                function_call = process_response(decoded_unicode, use_tool=True)
-            except:
-                logger.warning(
-                    "Failed to parse tool call, maybe the response is not a tool call or have been answered.")
-
-        if isinstance(function_call, dict):
-            function_call = FunctionCallResponse(**function_call)
-
-        delta = DeltaMessage(
-            content=None,
-            role="assistant",
-            function_call=None,
-            tool_calls=[{
-                "id": f"call_{int(time.time() * 1000)}",
-                "index": 0,
-                "type": "function",
-                "function": function_call
-            }] if isinstance(function_call, FunctionCallResponse) else None,
-        )
-
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=0,
-            delta=delta,
-            finish_reason=finish_reason
-        )
-        chunk = ChatCompletionResponse(
-            model=model_id,
-            id="",
-            choices=[choice_data],
-            object="chat.completion.chunk"
-        )
-        yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
-    )
-    chunk = ChatCompletionResponse(
-        model=model_id,
-        id="",
-        choices=[choice_data],
-        object="chat.completion.chunk"
-    )
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    yield '[DONE]'


 async def predict_stream(model_id, gen_params):
     output = ""
     is_function_call = False

@@ -537,24 +458,20 @@ async def predict_stream(model_id, gen_params):
     yield '[DONE]'


-async def parse_output_text(model_id: str, value: str):
+async def parse_output_text(model_id: str, value: str, function_call: FunctionCallResponse = None):
+    delta = DeltaMessage(role="assistant", content=value)
+    if function_call is not None:
+        delta.function_call = function_call
+
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
-        delta=DeltaMessage(role="assistant", content=value),
+        delta=delta,
         finish_reason=None
     )
     chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
         delta=DeltaMessage(),
         finish_reason="stop"
     )
     chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
     yield '[DONE]'


 if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
     engine_args = AsyncEngineArgs(

@@ -563,7 +480,8 @@ if __name__ == "__main__":
         tensor_parallel_size=1,
         dtype="bfloat16",
         trust_remote_code=True,
-        gpu_memory_utilization=0.3,
+        # Fraction of GPU memory to claim; set an appropriate value for your GPU's memory size. For example, if your GPU has 80 GB and you only want to use 24 GB, set 24/80 = 0.3.
+        gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=False,
         engine_use_ray=False,

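To make the new comment concrete (an illustration, not part of the diff): gpu_memory_utilization is the fraction of total GPU memory that vLLM may claim, so a 24 GB budget on an 80 GB card corresponds to 0.3, while the value committed here hands the engine 90% of the card:

# Illustration only: computing a vLLM gpu_memory_utilization value.
total_gb = 80       # e.g. one NVIDIA A100-SXM4-80GB
budget_gb = 24      # memory you are willing to hand to the engine
fraction = budget_gb / total_gb
print(fraction)     # 0.3 -> AsyncEngineArgs(gpu_memory_utilization=0.3, ...)
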
@@ -1,3 +1,4 @@
+# Please install the requirements.txt in basic_demo first!
 # use vllm
 # vllm>=0.4.3

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (if you use Python 3.12.3, you currently need to install nltk from the git source)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

@@ -12,7 +12,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (currently, you need to install nltk from the git source code if you use Python 3.12.3)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

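A note on the nltk remark above: one way to install nltk from its Git source under Python 3.12.3 (this exact command is an assumption, not part of the commit) is:

pip install git+https://github.com/nltk/nltk.git
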