Update some of the documentation

zR 2024-06-08 13:26:43 +08:00
parent 97c8bf8f45
commit 20a9f26ec6
7 changed files with 35 additions and 109 deletions

View File

@@ -16,7 +16,7 @@ Read this in [English](README_en.md).
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (both tested)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -16,7 +16,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 have been tested
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -8,8 +8,8 @@ base_url = "http://127.0.0.1:8000/v1/"
 client = OpenAI(api_key="EMPTY", base_url=base_url)
 
 
-def function_chat():
-    messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
+def function_chat(use_stream=False):
+    messages = [{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}]
     tools = [
         {
             "type": "function",
@@ -47,17 +47,24 @@ def function_chat():
         model="glm-4",
         messages=messages,
         tools=tools,
-        stream=False,  # must use False
+        stream=use_stream,
+        max_tokens=256,
+        temperature=0.9,
+        presence_penalty=1.2,
+        top_p=0.1,
         tool_choice="auto",  # use "auto" to let the model choose the tool automatically
         # tool_choice={"type": "function", "function": {"name": "my_function"}},
     )
     if response:
-        print(response.choices[0].message)
+        if use_stream:
+            for chunk in response:
+                print(chunk)
+        else:
+            print(response)
     else:
         print("Error:", response.status_code)
 
 
 def simple_chat(use_stream=False):
     messages = [
         {
@@ -74,20 +81,20 @@ def simple_chat(use_stream=False):
         messages=messages,
         stream=use_stream,
         max_tokens=256,
-        temperature=0.1,
-        presence_penalty=1.1,
-        top_p=0.8)
+        temperature=0.4,
+        presence_penalty=1.2,
+        top_p=0.8,
+    )
     if response:
         if use_stream:
             for chunk in response:
-                print(chunk.choices[0].delta.content)
+                print(chunk)
         else:
-            content = response.choices[0].message.content
-            print(content)
+            print(response)
     else:
         print("Error:", response.status_code)
 
 
 if __name__ == "__main__":
-    # simple_chat(use_stream=False)
-    function_chat()
+    simple_chat(use_stream=True)
+    # function_chat(use_stream=False)  # Only False is supported
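Side note, not part of this commit: the updated simple_chat(use_stream=True) prints whole ChatCompletionChunk objects. If you want the assembled reply instead, the incremental text lives in choices[0].delta.content of each chunk. A minimal sketch against the same local endpoint, assuming the demo server from this repo is already running (the prompt is a placeholder):

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")

# Stream a completion and rebuild the full reply from the content deltas.
stream = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    stream=True,
    max_tokens=256,
    temperature=0.4,
    top_p=0.8,
)
reply = ""
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:  # role-only and final chunks carry no content
        reply += delta
        print(delta, end="", flush=True)
print()

The trailing '[DONE]' event is consumed by the OpenAI client itself, so the loop simply ends when the stream is finished.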

View File

@@ -286,7 +286,6 @@ def process_messages(messages, tools=None, tool_choice="none"):
             if m.role == 'system':
                 messages.insert(0, {"role": m.role, "content": m.content})
                 break
-
     return messages
@@ -334,19 +333,12 @@ async def create_chat_completion(request: ChatCompletionRequest):
             except:
                 logger.warning("Failed to parse tool call")
-        # CallFunction
         if isinstance(function_call, dict):
             function_call = FunctionCallResponse(**function_call)
-            tool_response = ""
-            if not gen_params.get("messages"):
-                gen_params["messages"] = []
-            gen_params["messages"].append(ChatMessage(role="assistant", content=output))
-            gen_params["messages"].append(ChatMessage(role="tool", name=function_call.name, content=tool_response))
-            generate = predict(request.model, gen_params)
+            generate = parse_output_text(request.model, output, function_call=function_call)
             return EventSourceResponse(generate, media_type="text/event-stream")
         else:
-            generate = parse_output_text(request.model, output)
-            return EventSourceResponse(generate, media_type="text/event-stream")
+            return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
 
     response = ""
     async for response in generate_stream_glm4(gen_params):
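A note on the hunk above, not part of the diff: for a streaming request whose first output is a tool call, the server no longer appends a synthetic tool message and re-runs generation through predict; it now emits a single chunk from parse_output_text with the parsed call attached to the delta's function_call field. A hedged client-side sketch of picking that up with the OpenAI Python SDK (the tool schema and prompt are placeholders, and the updated request demo itself only exercises function_chat with use_stream=False):

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")

# Placeholder tool schema for illustration; use the real one from openai_api_request.py.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
}]

stream = client.chat.completions.create(
    model="glm-4",
    messages=[{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}],
    tools=tools,
    tool_choice="auto",
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.function_call is not None:  # attached by parse_output_text for tool calls
        print("tool call:", delta.function_call.name, delta.function_call.arguments)
    elif delta.content:
        print(delta.content, end="", flush=True)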
@@ -405,77 +397,6 @@ async def create_chat_completion(request: ChatCompletionRequest):
         usage=usage
     )
 
 
-async def predict(model_id: str, params: dict):
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(role="assistant"),
-        finish_reason=None
-    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    previous_text = ""
-    async for new_response in generate_stream_glm4(params):
-        decoded_unicode = new_response["text"]
-        delta_text = decoded_unicode[len(previous_text):]
-        previous_text = decoded_unicode
-        finish_reason = new_response["finish_reason"]
-        if len(delta_text) == 0 and finish_reason != "tool_calls":
-            continue
-        function_call = None
-        if finish_reason == "tool_calls":
-            try:
-                function_call = process_response(decoded_unicode, use_tool=True)
-            except:
-                logger.warning(
-                    "Failed to parse tool call, maybe the response is not a tool call or have been answered.")
-        if isinstance(function_call, dict):
-            function_call = FunctionCallResponse(**function_call)
-        delta = DeltaMessage(
-            content=None,
-            role="assistant",
-            function_call=None,
-            tool_calls=[{
-                "id": f"call_{int(time.time() * 1000)}",
-                "index": 0,
-                "type": "function",
-                "function": function_call
-            }] if isinstance(function_call, FunctionCallResponse) else None,
-        )
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=0,
-            delta=delta,
-            finish_reason=finish_reason
-        )
-        chunk = ChatCompletionResponse(
-            model=model_id,
-            id="",
-            choices=[choice_data],
-            object="chat.completion.chunk"
-        )
-        yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
-    )
-    chunk = ChatCompletionResponse(
-        model=model_id,
-        id="",
-        choices=[choice_data],
-        object="chat.completion.chunk"
-    )
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    yield '[DONE]'
 
 
 async def predict_stream(model_id, gen_params):
     output = ""
     is_function_call = False
@@ -537,24 +458,20 @@ async def predict_stream(model_id, gen_params):
     yield '[DONE]'
 
 
-async def parse_output_text(model_id: str, value: str):
+async def parse_output_text(model_id: str, value: str, function_call: FunctionCallResponse = None):
+    delta = DeltaMessage(role="assistant", content=value)
+    if function_call is not None:
+        delta.function_call = function_call
     choice_data = ChatCompletionResponseStreamChoice(
         index=0,
-        delta=DeltaMessage(role="assistant", content=value),
+        delta=delta,
         finish_reason=None
     )
     chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
     yield "{}".format(chunk.model_dump_json(exclude_unset=True))
-    choice_data = ChatCompletionResponseStreamChoice(
-        index=0,
-        delta=DeltaMessage(),
-        finish_reason="stop"
-    )
-    chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
-    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
     yield '[DONE]'
 
 
 if __name__ == "__main__":
     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
     engine_args = AsyncEngineArgs(
@@ -563,7 +480,8 @@ if __name__ == "__main__":
         tensor_parallel_size=1,
         dtype="bfloat16",
         trust_remote_code=True,
-        gpu_memory_utilization=0.3,  # fraction of GPU memory to occupy; set it according to your card's memory size, e.g. if your card has 80G and you only want to use 24G, set 24/80 = 0.3
+        gpu_memory_utilization=0.9,
         enforce_eager=True,
         worker_use_ray=False,
         engine_use_ray=False,
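The removed comment carried the sizing rule for this knob: gpu_memory_utilization is the fraction of the card's memory vLLM may claim, i.e. desired memory divided by total memory; this commit simply resets the default to 0.9. A tiny sketch of that arithmetic (the 24 GB budget is just the example from the removed comment, not a recommendation):

TOTAL_GB = 80      # e.g. one A100-SXM4-80GB, as listed in the test hardware above
BUDGET_GB = 24     # hypothetical memory budget from the removed comment
gpu_memory_utilization = BUDGET_GB / TOTAL_GB   # 0.3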

View File

@@ -1,3 +1,4 @@
+# Please install the requirements.txt in basic_demo first!
 # use vllm
 # vllm>=0.4.3

View File

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (if you use Python 3.12.3, you currently need to install nltk from the git source)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -12,7 +12,7 @@ Test hardware information:
 + OS: Ubuntu 22.04
 + Memory: 512GB
-+ Python: 3.12.3
++ Python: 3.10.12 / 3.12.3 (Currently, you need to install nltk from the git source code if you use Python 3.12.3)
 + CUDA Version: 12.3
 + GPU Driver: 535.104.05
 + GPU: NVIDIA A100-SXM4-80GB * 8