Overview

Streaming allows you to receive model responses incrementally as they’re generated, rather than waiting for the complete response. This is implemented using Server-Sent Events (SSE) and is useful for real-time user interfaces.

Enabling Streaming

Set stream: true in your chat completions request:
import requests
import json

url = "http://localhost:8000/v1/chat/completions"
data = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "user", "content": "Count from 1 to 10"}
    ],
    "stream": True
}

# stream=True tells requests not to buffer the whole response body
response = requests.post(url, json=data, stream=True)

for line in response.iter_lines():
    if line:
        line = line.decode('utf-8')
        # SSE payload lines are prefixed with "data: "
        if line.startswith('data: '):
            data_str = line[6:]
            if data_str == '[DONE]':  # end-of-stream sentinel
                break
            chunk = json.loads(data_str)
            delta = chunk['choices'][0]['delta']
            if 'content' in delta:
                print(delta['content'], end='', flush=True)

Stream Response Format

Initial Chunk

The first chunk contains the role:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {
        "role": "assistant"
      },
      "finish_reason": null
    }
  ],
  "created": 1677652288
}

Content Chunks

Subsequent chunks contain content deltas:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {
        "content": "Hello"
      },
      "finish_reason": null
    }
  ]
}

Final Chunk

The last chunk has an empty delta and a finish_reason:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {},
      "finish_reason": "stop"
    }
  ]
}

Stream Termination

After the final chunk, the server sends a literal sentinel line to terminate the stream:
data: [DONE]

Response Fields

object (string)
Always "chat.completion.chunk" for streaming responses.

choices[].delta (object)
Incremental content update:
  • role: present in the first chunk only
  • content: text content delta (incremental, not cumulative)

choices[].finish_reason (string | null)
null while generation is in progress, then one of:
  • "stop": natural completion
  • "length": reached max_length

Python Client Examples

Basic Streaming

import requests
import json

def stream_chat(messages):
    url = "http://localhost:8000/v1/chat/completions"
    response = requests.post(
        url,
        json={
            "model": "gpt-3.5-turbo",
            "messages": messages,
            "stream": True
        },
        stream=True
    )
    
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                data = line[6:]
                if data == '[DONE]':
                    print()  # New line at end
                    break
                
                chunk = json.loads(data)
                delta = chunk['choices'][0]['delta']
                
                if 'content' in delta:
                    print(delta['content'], end='', flush=True)

messages = [
    {"role": "user", "content": "Write a haiku about programming"}
]

stream_chat(messages)

With OpenAI SDK

import openai

# Note: this targets the legacy openai<1.0 SDK; openai.ChatCompletion
# was removed in openai>=1.0 (see the variant below).
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"  # placeholder; the example server does not require a real key

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)

for chunk in response:
    delta = chunk.choices[0].delta
    if hasattr(delta, 'content'):
        print(delta.content, end='', flush=True)

print()  # New line
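
If you use the current SDK (openai>=1.0), the legacy module-level interface above no longer exists; the equivalent, pointing at the same local server, looks like this:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:  # None on the role-only first chunk and the final chunk
        print(delta.content, end='', flush=True)

print()  # New line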

Collecting Full Response

import requests
import json

def stream_and_collect(messages):
    url = "http://localhost:8000/v1/chat/completions"
    response = requests.post(
        url,
        json={
            "model": "gpt-3.5-turbo",
            "messages": messages,
            "stream": True
        },
        stream=True
    )
    
    full_response = ""
    
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                data = line[6:]
                if data == '[DONE]':
                    break
                
                chunk = json.loads(data)
                delta = chunk['choices'][0]['delta']
                
                if 'content' in delta:
                    content = delta['content']
                    full_response += content
                    print(content, end='', flush=True)
    
    print()  # New line
    return full_response

response = stream_and_collect([
    {"role": "user", "content": "Explain async programming"}
])
print(f"\nFull response length: {len(response)} characters")

JavaScript Example

async function streamChat(messages) {
  const response = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: messages,
      stream: true,
    }),
  });

  const reader = response.body.getReader();
  const decoder = new TextDecoder('utf-8');
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split('\n');
    buffer = lines.pop();

    for (const line of lines) {
      if (line.startsWith('data: ')) {
        const data = line.slice(6);
        if (data === '[DONE]') {
          return;
        }

        const chunk = JSON.parse(data);
        const delta = chunk.choices[0].delta;
        if (delta.content) {
          // Node.js: write without a trailing newline; in a browser,
          // append to the DOM instead.
          process.stdout.write(delta.content);
        }
      }
    }
  }
}

streamChat([
  { role: 'user', content: 'Write a poem about AI' }
]);

Limitations

Function Calling Not Supported

Streaming does not support function calling:
# This will return HTTP 400
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "functions": [{"name": "test", "parameters": {}}]  # Not allowed
    }
)
Error response:
{
  "detail": "Invalid request: Function calling is not yet implemented for stream mode."
}
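
If your application mixes function calling with streaming, one workaround is to fall back to a non-streaming request whenever functions are present. A minimal sketch; the fallback policy is an application-level choice, not part of the API:

import requests

def chat(messages, functions=None):
    payload = {"model": "gpt-3.5-turbo", "messages": messages}
    if functions:
        # Function calling is rejected in stream mode, so send a
        # regular (non-streaming) request instead.
        payload["functions"] = functions
        payload["stream"] = False
    else:
        payload["stream"] = True
    return requests.post(
        "http://localhost:8000/v1/chat/completions",
        json=payload,
        stream=payload["stream"]
    )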

Best Practices

Buffer Management

Handle partial lines in streams:
def stream_with_buffer(response):
    buffer = ""

    # iter_content can split the stream mid-line; accumulate raw text
    # and only process complete lines.
    for chunk in response.iter_content(chunk_size=1024, decode_unicode=True):
        buffer += chunk

        while '\n' in buffer:
            line, buffer = buffer.split('\n', 1)
            if line.startswith('data: '):
                process_line(line)  # defined below
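
process_line above is left to the caller; a minimal handler, consistent with the earlier examples, might look like this:

import json

def process_line(line):
    data = line[len('data: '):]
    if data == '[DONE]':
        return
    chunk = json.loads(data)
    delta = chunk['choices'][0]['delta']
    if 'content' in delta:
        print(delta['content'], end='', flush=True)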

Error Handling

import requests
import json

def safe_stream(messages):
    try:
        response = requests.post(
            "http://localhost:8000/v1/chat/completions",
            json={
                "model": "gpt-3.5-turbo",
                "messages": messages,
                "stream": True
            },
            stream=True,
            timeout=30
        )
        response.raise_for_status()
        
        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8')
                if line.startswith('data: '):
                    data = line[6:]
                    if data == '[DONE]':
                        break
                    
                    try:
                        chunk = json.loads(data)
                        delta = chunk['choices'][0]['delta']
                        if 'content' in delta:
                            yield delta['content']
                    except json.JSONDecodeError:
                        print(f"Warning: Invalid JSON: {data}")
                        continue
    
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return

for content in safe_stream([{"role": "user", "content": "Hello"}]):
    print(content, end='', flush=True)

Stop Word Handling

The streaming implementation includes a delay buffer so that stop words are handled correctly: the last few tokens may be held back temporarily and checked against the stop sequences before being yielded.
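
The server-side logic is internal; the following is an illustrative sketch of the delay-buffer idea, not the actual implementation:

def delayed_stream(tokens, stop_words):
    # Hold back enough trailing text to detect any stop word
    # before releasing it downstream.
    hold = max(len(s) for s in stop_words)  # longest possible stop sequence
    buffer = ""
    for token in tokens:
        buffer += token
        for stop in stop_words:
            idx = buffer.find(stop)
            if idx != -1:
                yield buffer[:idx]  # emit text before the stop word, then end
                return
        # Everything except the last `hold` characters can no longer
        # be part of an unseen stop sequence, so it is safe to emit.
        if len(buffer) > hold:
            yield buffer[:-hold]
            buffer = buffer[-hold:]
    yield buffer  # stream ended without hitting a stop word

This is why the last few characters of output may arrive slightly later than the rest: they stay buffered until the generator is sure they are not the start of a stop sequence.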