Update some of the documentation

parent 97c8bf8f45
commit 20a9f26ec6

@@ -16,7 +16,7 @@ Read this in [English](README_en.md).
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (both tested)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

@@ -16,7 +16,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 have been tested
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

@@ -8,8 +8,8 @@ base_url = "http://127.0.0.1:8000/v1/"
 client = OpenAI(api_key="EMPTY", base_url=base_url)


-def function_chat():
-    messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
+def function_chat(use_stream=False):
+    messages = [{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}]
     tools = [
         {
             "type": "function",

@@ -47,17 +47,24 @@ def function_chat():
         model="glm-4",
         messages=messages,
         tools=tools,
-        stream=False,  # must use False
+        stream=use_stream,
         max_tokens=256,
         temperature=0.9,
         presence_penalty=1.2,
         top_p=0.1,
+        tool_choice="auto",  # use "auto" to let the model choose the tool automatically
+        # tool_choice={"type": "function", "function": {"name": "my_function"}},
     )
     if response:
-        print(response.choices[0].message)
+        if use_stream:
+            for chunk in response:
+                print(chunk)
+        else:
+            print(response)
     else:
         print("Error:", response.status_code)


 def simple_chat(use_stream=False):
     messages = [
         {

@@ -74,20 +81,20 @@ def simple_chat(use_stream=False):
         messages=messages,
         stream=use_stream,
         max_tokens=256,
-        temperature=0.1,
-        presence_penalty=1.1,
-        top_p=0.8)
+        temperature=0.4,
+        presence_penalty=1.2,
+        top_p=0.8,
+    )
     if response:
         if use_stream:
             for chunk in response:
-                print(chunk.choices[0].delta.content)
+                print(chunk)
         else:
-            content = response.choices[0].message.content
-            print(content)
+            print(response)
     else:
         print("Error:", response.status_code)


 if __name__ == "__main__":
-    # simple_chat(use_stream=False)
-    function_chat()
+    simple_chat(use_stream=True)
+    # function_chat(use_stream=False)  # Only False is supported

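Not part of the diff above, just a minimal client sketch for exercising the updated demo script, assuming openai>=1.0 is installed and the demo server is listening at http://127.0.0.1:8000/v1/ as in openai_api_request.py:

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")

# Non-streaming request: the whole answer arrives in one response object.
response = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    stream=False,
    max_tokens=256,
)
print(response.choices[0].message.content)

# Streaming request, as simple_chat(use_stream=True) does above:
# each chunk carries a small delta of the reply.
stream = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    stream=True,
    max_tokens=256,
)
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end="")
print()
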
@@ -286,7 +286,6 @@ def process_messages(messages, tools=None, tool_choice="none"):
         if m.role == 'system':
             messages.insert(0, {"role": m.role, "content": m.content})
             break
-
     return messages

@@ -334,19 +333,12 @@ async def create_chat_completion(request: ChatCompletionRequest):
            except:
                logger.warning("Failed to parse tool call")

-        # CallFunction
         if isinstance(function_call, dict):
             function_call = FunctionCallResponse(**function_call)
-            tool_response = ""
-            if not gen_params.get("messages"):
-                gen_params["messages"] = []
-            gen_params["messages"].append(ChatMessage(role="assistant", content=output))
-            gen_params["messages"].append(ChatMessage(role="tool", name=function_call.name, content=tool_response))
-            generate = predict(request.model, gen_params)
+            generate = parse_output_text(request.model, output, function_call=function_call)
             return EventSourceResponse(generate, media_type="text/event-stream")
         else:
             generate = parse_output_text(request.model, output)
             return EventSourceResponse(generate, media_type="text/event-stream")
-        return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")

     response = ""
     async for response in generate_stream_glm4(gen_params):

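As a side note on the hunk above (not part of this commit): FunctionCallResponse(**function_call) turns the dict parsed from the model output into a typed object whose .name field is used by the server. A self-contained sketch with an assumed pydantic shape, since the real class is defined elsewhere in this repo:

from pydantic import BaseModel

class FunctionCallResponse(BaseModel):
    # Assumed shape for illustration; the repo's actual class may differ.
    name: str
    arguments: str

# A dict of the kind the tool-call parser might return (made-up values).
parsed = {"name": "get_current_weather", "arguments": '{"location": "San Francisco"}'}
function_call = FunctionCallResponse(**parsed)
print(function_call.name, function_call.arguments)
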
@@ -405,77 +397,6 @@ async def create_chat_completion(request: ChatCompletionRequest):
         usage=usage
     )


-async def predict(model_id: str, params: dict):
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(role="assistant"),
-        finish_reason=None
-    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    previous_text = ""
-    async for new_response in generate_stream_glm4(params):
-        decoded_unicode = new_response["text"]
-        delta_text = decoded_unicode[len(previous_text):]
-        previous_text = decoded_unicode
-
-        finish_reason = new_response["finish_reason"]
-        if len(delta_text) == 0 and finish_reason != "tool_calls":
-            continue
-
-        function_call = None
-        if finish_reason == "tool_calls":
-            try:
-                function_call = process_response(decoded_unicode, use_tool=True)
-            except:
-                logger.warning(
-                    "Failed to parse tool call, maybe the response is not a tool call or have been answered.")
-
-        if isinstance(function_call, dict):
-            function_call = FunctionCallResponse(**function_call)
-
-        delta = DeltaMessage(
-            content=None,
-            role="assistant",
-            function_call=None,
-            tool_calls=[{
-                "id": f"call_{int(time.time() * 1000)}",
-                "index": 0,
-                "type": "function",
-                "function": function_call
-            }] if isinstance(function_call, FunctionCallResponse) else None,
-        )
-
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=0,
-            delta=delta,
-            finish_reason=finish_reason
-        )
-        chunk = ChatCompletionResponse(
-            model=model_id,
-            id="",
-            choices=[choice_data],
-            object="chat.completion.chunk"
-        )
-        yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
-    )
-    chunk = ChatCompletionResponse(
-        model=model_id,
-        id="",
-        choices=[choice_data],
-        object="chat.completion.chunk"
-    )
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    yield '[DONE]'


 async def predict_stream(model_id, gen_params):
     output = ""
     is_function_call = False

@@ -537,24 +458,20 @@ async def predict_stream(model_id, gen_params):
     yield '[DONE]'


-async def parse_output_text(model_id: str, value: str):
+async def parse_output_text(model_id: str, value: str, function_call: FunctionCallResponse = None):
+    delta = DeltaMessage(role="assistant", content=value)
+    if function_call is not None:
+        delta.function_call = function_call
+
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
-        delta=DeltaMessage(role="assistant", content=value),
+        delta=delta,
         finish_reason=None
     )
     chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
         delta=DeltaMessage(),
         finish_reason="stop"
     )
     chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
     yield '[DONE]'


 if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
     engine_args = AsyncEngineArgs(

@@ -563,7 +480,8 @@ if __name__ == "__main__":
         tensor_parallel_size=1,
         dtype="bfloat16",
         trust_remote_code=True,
-        gpu_memory_utilization=0.3,
+        # Fraction of GPU memory to claim; set an appropriate value for your GPU's memory size. For example, if your GPU has 80 GB and you only want to use 24 GB, set 24/80 = 0.3.
+        gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=False,
         engine_use_ray=False,

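To make the new comment concrete (an illustration, not part of the diff): gpu_memory_utilization is the fraction of total GPU memory that vLLM may claim, so a 24 GB budget on an 80 GB card corresponds to 0.3, while the value committed here hands the engine 90% of the card:

# Illustration only: computing a vLLM gpu_memory_utilization value.
total_gb = 80       # e.g. one NVIDIA A100-SXM4-80GB
budget_gb = 24      # memory you are willing to hand to the engine
fraction = budget_gb / total_gb
print(fraction)     # 0.3 -> AsyncEngineArgs(gpu_memory_utilization=0.3, ...)
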
@@ -1,3 +1,4 @@
+# Please install the requirements.txt in basic_demo first!
 # use vllm
 # vllm>=0.4.3

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (if you use Python 3.12.3, you currently need to install nltk from the git source)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

@@ -12,7 +12,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (currently, you need to install nltk from the git source code if you use Python 3.12.3)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

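A note on the nltk remark above: one way to install nltk from its Git source under Python 3.12.3 (this exact command is an assumption, not part of the commit) is:

pip install git+https://github.com/nltk/nltk.git
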