Update part of the documentation

zR 2024-06-08 13:26:43 +08:00
parent 97c8bf8f45
commit 20a9f26ec6
7 changed files with 35 additions and 109 deletions

View File

@@ -16,7 +16,7 @@ Read this in [English](README_en.md).
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ Python: 3.10.12 / 3.12.3 (both tested)
+ CUDA Version: 12.3
+ GPU Driver: 535.104.05
+ GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -16,7 +16,7 @@ Test hardware information:
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ Python: 3.10.12 / 3.12.3 (both have been tested)
+ CUDA Version: 12.3
+ GPU Driver: 535.104.05
+ GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -8,8 +8,8 @@ base_url = "http://127.0.0.1:8000/v1/"
client = OpenAI(api_key="EMPTY", base_url=base_url)
def function_chat():
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
def function_chat(use_stream=False):
messages = [{"role": "user", "content": "What's the Celsius temperature in San Francisco?"}]
tools = [
{
"type": "function",
@@ -47,17 +47,24 @@ def function_chat():
model="glm-4",
messages=messages,
tools=tools,
stream=False, # must use False
stream=use_stream,
max_tokens=256,
temperature=0.9,
presence_penalty=1.2,
top_p=0.1,
tool_choice="auto", # use "auto" to let the model choose the tool automatically
# tool_choice={"type": "function", "function": {"name": "my_function"}},
)
if response:
print(response.choices[0].message)
if use_stream:
for chunk in response:
print(chunk)
else:
print(response)
else:
print("Error:", response.status_code)
def simple_chat(use_stream=False):
messages = [
{
@@ -74,20 +81,20 @@ def simple_chat(use_stream=False):
messages=messages,
stream=use_stream,
max_tokens=256,
temperature=0.1,
presence_penalty=1.1,
top_p=0.8)
temperature=0.4,
presence_penalty=1.2,
top_p=0.8,
)
if response:
if use_stream:
for chunk in response:
print(chunk.choices[0].delta.content)
print(chunk)
else:
content = response.choices[0].message.content
print(content)
print(response)
else:
print("Error:", response.status_code)
if __name__ == "__main__":
# simple_chat(use_stream=False)
function_chat()
simple_chat(use_stream=True)
# function_chat(use_stream=False) # Only False is supported
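The script above only prints the raw response from function_chat(); below is a hedged sketch of one way to complete the tool-calling loop with stream=False. It reuses the module-level client and the tools list defined above; get_current_weather, answer_with_tool, and the follow-up "tool" message are illustrative assumptions based on the OpenAI-compatible schema, not part of this commit, and whether the demo server accepts the second round trip is not stated in this diff.

import json

def get_current_weather(location, unit="celsius"):
    # Hypothetical local stand-in for the tool declared in `tools`.
    return json.dumps({"location": location, "temperature": "15", "unit": unit})

def answer_with_tool(response, messages, tools):
    message = response.choices[0].message
    if not message.tool_calls:  # the model answered directly, nothing to do
        return response
    messages.append(message)  # keep the assistant turn that requested the tool
    for call in message.tool_calls:
        args = json.loads(call.function.arguments)
        messages.append({
            "role": "tool",
            "tool_call_id": call.id,
            "content": get_current_weather(**args),
        })
    # Ask the model again now that the tool result is in the conversation.
    return client.chat.completions.create(
        model="glm-4", messages=messages, tools=tools, stream=False
    )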

View File

@@ -286,7 +286,6 @@ def process_messages(messages, tools=None, tool_choice="none"):
if m.role == 'system':
messages.insert(0, {"role": m.role, "content": m.content})
break
return messages
@@ -334,19 +333,12 @@ async def create_chat_completion(request: ChatCompletionRequest):
except:
logger.warning("Failed to parse tool call")
# CallFunction
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
tool_response = ""
if not gen_params.get("messages"):
gen_params["messages"] = []
gen_params["messages"].append(ChatMessage(role="assistant", content=output))
gen_params["messages"].append(ChatMessage(role="tool", name=function_call.name, content=tool_response))
generate = predict(request.model, gen_params)
generate = parse_output_text(request.model, output, function_call=function_call)
return EventSourceResponse(generate, media_type="text/event-stream")
else:
generate = parse_output_text(request.model, output)
return EventSourceResponse(generate, media_type="text/event-stream")
return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
response = ""
async for response in generate_stream_glm4(gen_params):
@@ -405,77 +397,6 @@ async def create_chat_completion(request: ChatCompletionRequest):
usage=usage
)
async def predict(model_id: str, params: dict):
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
previous_text = ""
async for new_response in generate_stream_glm4(params):
decoded_unicode = new_response["text"]
delta_text = decoded_unicode[len(previous_text):]
previous_text = decoded_unicode
finish_reason = new_response["finish_reason"]
if len(delta_text) == 0 and finish_reason != "tool_calls":
continue
function_call = None
if finish_reason == "tool_calls":
try:
function_call = process_response(decoded_unicode, use_tool=True)
except:
logger.warning(
"Failed to parse tool call, maybe the response is not a tool call or have been answered.")
if isinstance(function_call, dict):
function_call = FunctionCallResponse(**function_call)
delta = DeltaMessage(
content=None,
role="assistant",
function_call=None,
tool_calls=[{
"id": f"call_{int(time.time() * 1000)}",
"index": 0,
"type": "function",
"function": function_call
}] if isinstance(function_call, FunctionCallResponse) else None,
)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=delta,
finish_reason=finish_reason
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(
model=model_id,
id="",
choices=[choice_data],
object="chat.completion.chunk"
)
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
async def predict_stream(model_id, gen_params):
output = ""
is_function_call = False
@@ -537,24 +458,20 @@ async def predict_stream(model_id, gen_params):
yield '[DONE]'
async def parse_output_text(model_id: str, value: str):
async def parse_output_text(model_id: str, value: str, function_call: FunctionCallResponse = None):
delta = DeltaMessage(role="assistant", content=value)
if function_call is not None:
delta.function_call = function_call
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant", content=value),
delta=delta,
finish_reason=None
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk")
yield "{}".format(chunk.model_dump_json(exclude_unset=True))
yield '[DONE]'
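As an aside, here is a hedged sketch of what a raw HTTP client sees on the wire from predict_stream / parse_output_text: "data:" lines carrying the chunk JSON, terminated by the "[DONE]" sentinel yielded above. The /v1/chat/completions path and the exact field layout are assumptions inferred from the OpenAI-compatible models used in this file, not something this diff states.

import json
import requests

payload = {
    "model": "glm-4",
    "stream": True,
    "messages": [{"role": "user", "content": "hello"}],
}
with requests.post("http://127.0.0.1:8000/v1/chat/completions",
                   json=payload, stream=True) as resp:
    for raw in resp.iter_lines():
        if not raw:
            continue
        line = raw.decode("utf-8")
        if not line.startswith("data:"):
            continue
        data = line[len("data:"):].strip()
        if data == "[DONE]":  # sentinel yielded by the generators above
            break
        chunk = json.loads(data)
        delta = chunk["choices"][0]["delta"]
        # exclude_unset=True means unset fields are simply absent from the JSON
        print(delta.get("content") or delta.get("tool_calls") or "", end="")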
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
engine_args = AsyncEngineArgs(
@@ -563,7 +480,8 @@ if __name__ == "__main__":
tensor_parallel_size=1,
dtype="bfloat16",
trust_remote_code=True,
gpu_memory_utilization=0.3,
# Fraction of GPU memory to occupy. Set a value that fits your card's memory: for example, if your GPU has 80G and you only want to use 24G, set 24/80 = 0.3.
gpu_memory_utilization=0.9,
enforce_eager=True,
worker_use_ray=False,
engine_use_ray=False,
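The new comment above describes gpu_memory_utilization as a fraction of the card's total memory. A tiny illustrative helper for that arithmetic follows; the function name is ours, not part of the commit.

def gpu_memory_fraction(wanted_gb: float, total_gb: float) -> float:
    # Turn "use N GB of an M GB card" into the fraction vLLM expects.
    if not 0 < wanted_gb <= total_gb:
        raise ValueError("wanted_gb must be in (0, total_gb]")
    return round(wanted_gb / total_gb, 2)

print(gpu_memory_fraction(24, 80))  # 0.3, the example from the comment
print(gpu_memory_fraction(72, 80))  # 0.9, matching the new default above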

View File

@@ -1,3 +1,4 @@
# Please install the requirements.txt in basic_demo first!
# use vllm
# vllm>=0.4.3
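A small optional check, assuming the packaging package is available, to confirm the installed vllm satisfies the vllm>=0.4.3 line above; it is an illustration, not part of the requirements file.

from importlib.metadata import version
from packaging.version import Version

assert Version(version("vllm")) >= Version("0.4.3"), \
    "please upgrade: pip install 'vllm>=0.4.3'"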

View File

@@ -11,7 +11,7 @@ Read this in [English](README_en.md)
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ Python: 3.10.12 / 3.12.3 (if you use Python 3.12.3, you currently need to install nltk from the git source)
+ CUDA Version: 12.3
+ GPU Driver: 535.104.05
+ GPU: NVIDIA A100-SXM4-80GB * 8

View File

@@ -12,7 +12,7 @@ Test hardware information:
+ OS: Ubuntu 22.04
+ Memory: 512GB
+ Python: 3.12.3
+ Python: 3.10.12 / 3.12.3 (currently, you need to install nltk from the git source code if you use Python 3.12.3)
+ CUDA Version: 12.3
+ GPU Driver: 535.104.05
+ GPU: NVIDIA A100-SXM4-80GB * 8