Sending Requests#
This section provides a quick-start guide for sending chat completion requests to RTP-LLM after you have installed it.
For multimodal models, refer to OpenAI APIs - Vision.
For embedding models, refer to OpenAI APIs - Embedding and Encode (embedding model).
Launch A Server#
[ ]:
import socket
import subprocess
import time
import logging
import psutil
import requests
import json
from rtp_llm.utils.util import wait_sever_done, stop_server
port = 8090
server_process = subprocess.Popen(
    ["/opt/conda310/bin/python", "-m", "rtp_llm.start_server",
     "--checkpoint_path=/mnt/nas1/hf/models--Qwen--Qwen1.5-0.5B-Chat/snapshots/6114e9c18dac0042fa90925f03b046734369472f/",
     "--model_type=qwen_2",
     f"--start_port={port}"
    ]
)
wait_sever_done(server_process, port)
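Optionally, you can confirm the server is answering before moving on. This is a minimal sanity check against the same /v1/chat/completions endpoint used in the examples below; it is not a dedicated RTP-LLM health API.
[ ]:
import requests

# Optional sanity check: a tiny request to the OpenAI-compatible chat endpoint.
# A 200 status code means the server is up and serving requests.
probe = requests.post(
    f"http://localhost:{port}/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "ping"}]},
    timeout=60,
)
print(probe.status_code)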
Using cURL#
[ ]:
import subprocess, json
port = 8090
# Braces in the JSON body are doubled so the f-string leaves them intact.
curl_command = f"""
curl -s http://localhost:{port}/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -d '{{"messages": [{{"role": "user", "content": "What is the capital of France?"}}]}}'
"""
response = json.loads(subprocess.check_output(curl_command, shell=True))
print(f"Output: {response}")
Using Python Requests#
[ ]:
import requests
port = 8090
url = f"http://localhost:{port}/v1/chat/completions"
json_data = {
    "messages": [
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ]
}
response = requests.post(url, json=json_data)
print(f"Output 0: {response.json()}")
Using OpenAI Python Client#
[ ]:
import openai
port = 8090
# The OpenAI client appends /chat/completions itself, so base_url ends at /v1.
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
response = client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
)
print(response)
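The full response object is verbose. To print only the generated text, read it from the first choice; this uses the standard OpenAI client objects and assumes nothing RTP-LLM-specific:
[ ]:
# Print only the assistant's reply rather than the whole response object.
print(response.choices[0].message.content)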
Streaming#
[ ]:
import openai
port = 8090
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
# Use stream=True for streaming responses
response = client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
    stream=True,
)
# Handle the streaming output
for chunk in response:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
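If you also need the complete text after streaming finishes (for logging, for example), accumulate the deltas as they arrive. This is ordinary client-side bookkeeping, not an RTP-LLM feature:
[ ]:
# Collect the streamed deltas into one string while still printing them live.
full_text = ""
for chunk in client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
    stream=True,
):
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
        full_text += delta
print("\nFull text:", full_text)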
Using Native Generation APIs#
You can also use the server's native generation API directly with requests, which provides more flexibility; the examples below post to the server root. An API reference is available at Sampling Parameters.
[ ]:
import requests
port = 8090
response = requests.post(
    f"http://localhost:{port}/",
    json={
        "prompt": "The capital of France is",
        "generate_config": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
        "yield_generator": False,
    },
)
print(f"Output 0: {response.json()}")
Streaming#
[ ]:
import requests, json
port = 8090
response = requests.post(
    f"http://localhost:{port}/",
    json={
        "prompt": "The capital of France is",
        "generate_config": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
        "yield_generator": True,
    },
    stream=True,
)
# Each "data:" line carries the text generated so far; print only the new suffix.
prev = 0
for chunk in response.iter_lines(decode_unicode=False):
    chunk = chunk.decode("utf-8")
    if chunk and chunk.startswith("data:"):
        if chunk == "data:[done]":
            break
        data = json.loads(chunk[5:].strip("\n"))
        output = data["response"]
        print(output[prev:], end="", flush=True)
        prev = len(output)
[ ]:
stop_server(server_process)