Sending Requests#
This section provides a quick-start guide for sending chat completion requests to RTP-LLM after you have installed it.
For multimodal models, refer to OpenAI APIs - Vision.
For embedding models, refer to OpenAI APIs - Embedding and Encode (embedding model).
Launch A Server#
[ ]:
import socket
import subprocess
import time
import logging
import psutil
import requests
import json
from rtp_llm.utils.util import wait_sever_done, stop_server
port = 8090
server_process = subprocess.Popen(
    ["/opt/conda310/bin/python", "-m", "rtp_llm.start_server",
     "--checkpoint_path=/mnt/nas1/hf/models--Qwen--Qwen1.5-0.5B-Chat/snapshots/6114e9c18dac0042fa90925f03b046734369472f/",
     "--model_type=qwen_2",
     f"--start_port={port}"
    ]
)
wait_sever_done(server_process, port)
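Optionally, you can confirm the server is answering before moving on. This is a minimal sanity check against the same /v1/chat/completions endpoint used in the examples below; it is not a dedicated RTP-LLM health API.
[ ]:
import requests

# Optional sanity check: a tiny request to the OpenAI-compatible chat endpoint.
# A 200 status code means the server is up and serving requests.
probe = requests.post(
    f"http://localhost:{port}/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "ping"}]},
    timeout=60,
)
print(probe.status_code)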
Using cURL#
[ ]:
import subprocess, json
port = 8090
# Braces in the JSON body are doubled so the f-string leaves them intact.
curl_command = f"""
curl -s http://localhost:{port}/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -d '{{"messages": [{{"role": "user", "content": "What is the capital of France?"}}]}}'
"""
response = json.loads(subprocess.check_output(curl_command, shell=True))
print(f"Output: {response}")
Using Python Requests#
[ ]:
import requests
port = 8090
url = f"http://localhost:{port}/v1/chat/completions"
json_data = {
    "messages": [
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ]
}
response = requests.post(url, json=json_data)
print(f"Output 0: {response.json()}")
Using OpenAI Python Client#
[ ]:
import openai
port = 8090
# The OpenAI client appends /chat/completions itself, so base_url ends at /v1.
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
response = client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
)
print(response)
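The full response object is verbose. To print only the generated text, read it from the first choice; this uses the standard OpenAI client objects and assumes nothing RTP-LLM-specific:
[ ]:
# Print only the assistant's reply rather than the whole response object.
print(response.choices[0].message.content)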
Streaming#
[ ]:
import openai
port = 8090
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
# Use stream=True for streaming responses
response = client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
    stream=True,
)
# Handle the streaming output
for chunk in response:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
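If you also need the complete text after streaming finishes (for logging, for example), accumulate the deltas as they arrive. This is ordinary client-side bookkeeping, not an RTP-LLM feature:
[ ]:
# Collect the streamed deltas into one string while still printing them live.
full_text = ""
for chunk in client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
    stream=True,
):
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
        full_text += delta
print("\nFull text:", full_text)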
Using Native Generation APIs#
You can also use the server's native generation API directly with requests, which provides more flexibility; the examples below post to the server root. An API reference is available at Sampling Parameters.
[ ]:
import requests
port = 8090
response = requests.post(
    f"http://localhost:{port}/",
    json={
        "prompt": "The capital of France is",
        "generate_config": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
        "yield_generator": False,
    },
)
print(f"Output 0: {response.json()}")
Streaming#
[ ]:
import requests, json
port = 8090
response = requests.post(
    f"http://localhost:{port}/",
    json={
        "prompt": "The capital of France is",
        "generate_config": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
        "yield_generator": True,
    },
    stream=True,
)
# Each "data:" line carries the text generated so far; print only the new suffix.
prev = 0
for chunk in response.iter_lines(decode_unicode=False):
    chunk = chunk.decode("utf-8")
    if chunk and chunk.startswith("data:"):
        if chunk == "data:[done]":
            break
        data = json.loads(chunk[5:].strip("\n"))
        output = data["response"]
        print(output[prev:], end="", flush=True)
        prev = len(output)
[ ]:
stop_server(server_process)