Overview
Streaming allows you to receive model responses incrementally as they’re generated, rather than waiting for the complete response. This is implemented using Server-Sent Events (SSE) and is useful for real-time user interfaces.
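On the wire, each SSE event is a data: line carrying one JSON chunk, and the stream ends with a literal data: [DONE] sentinel. Abridged (fields like model, index, and created omitted), a stream looks roughly like this:

data: {"object": "chat.completion.chunk", "choices": [{"delta": {"role": "assistant"}, "finish_reason": null}]}

data: {"object": "chat.completion.chunk", "choices": [{"delta": {"content": "Hello"}, "finish_reason": null}]}

data: {"object": "chat.completion.chunk", "choices": [{"delta": {}, "finish_reason": "stop"}]}

data: [DONE]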
Enabling Streaming
Set stream: true in the request body, and pass stream=True to requests.post so the client does not buffer the entire response:
import requests
import json

url = "http://localhost:8000/v1/chat/completions"
data = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "user", "content": "Count from 1 to 10"}
    ],
    "stream": True
}

response = requests.post(url, json=data, stream=True)

for line in response.iter_lines():
    if line:
        line = line.decode('utf-8')
        if line.startswith('data: '):
            data_str = line[6:]
            if data_str == '[DONE]':
                break
            chunk = json.loads(data_str)
            delta = chunk['choices'][0]['delta']
            if 'content' in delta:
                print(delta['content'], end='', flush=True)
Initial Chunk
The first chunk contains the role:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {
        "role": "assistant"
      },
      "finish_reason": null
    }
  ],
  "created": 1677652288
}
Content Chunks
Subsequent chunks contain content deltas:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {
        "content": "Hello"
      },
      "finish_reason": null
    }
  ]
}
Final Chunk
The last chunk has an empty delta and a finish_reason:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {},
      "finish_reason": "stop"
    }
  ]
}
Stream Termination
After the final chunk, the server sends a literal data: [DONE] line; clients should stop reading when they receive it.
Response Fields
object: Always "chat.completion.chunk" for streaming responses.
delta: The incremental content update:
  role: Present in the first chunk only.
  content: Text content delta (not cumulative).
finish_reason: null during generation, then one of:
  "stop": Natural completion.
  "length": Reached the maximum generation length (max_length).
A sketch of watching finish_reason while streaming follows.
Python Client Examples
Basic Streaming
import requests
import json

def stream_chat(messages):
    url = "http://localhost:8000/v1/chat/completions"
    response = requests.post(
        url,
        json={
            "model": "gpt-3.5-turbo",
            "messages": messages,
            "stream": True
        },
        stream=True
    )
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                data = line[6:]
                if data == '[DONE]':
                    print()  # New line at end
                    break
                chunk = json.loads(data)
                delta = chunk['choices'][0]['delta']
                if 'content' in delta:
                    print(delta['content'], end='', flush=True)

messages = [
    {"role": "user", "content": "Write a haiku about programming"}
]
stream_chat(messages)
With OpenAI SDK
import openai

# Legacy (pre-1.0) openai package interface
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)

for chunk in response:
    delta = chunk.choices[0].delta
    # The first chunk carries only the role and the last has an empty delta
    if hasattr(delta, 'content') and delta.content:
        print(delta.content, end='', flush=True)
print()  # New line
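The snippet above uses the legacy (pre-1.0) openai package. If you have openai>=1.0 installed, the equivalent streaming call looks roughly like this (same local endpoint assumed):

from openai import OpenAI

# openai>=1.0 client; base_url points at the local server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True,
)

for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:  # None for the role-only and final chunks
        print(delta.content, end='', flush=True)
print()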
Collecting Full Response
import requests
import json

def stream_and_collect(messages):
    url = "http://localhost:8000/v1/chat/completions"
    response = requests.post(
        url,
        json={
            "model": "gpt-3.5-turbo",
            "messages": messages,
            "stream": True
        },
        stream=True
    )
    full_response = ""
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                data = line[6:]
                if data == '[DONE]':
                    break
                chunk = json.loads(data)
                delta = chunk['choices'][0]['delta']
                if 'content' in delta:
                    content = delta['content']
                    full_response += content
                    print(content, end='', flush=True)
    print()  # New line
    return full_response

response = stream_and_collect([
    {"role": "user", "content": "Explain async programming"}
])
print(f"\nFull response length: {len(response)} characters")
JavaScript Example
The same stream can be read with the Fetch API; this example writes output via process.stdout, so it targets Node.js:
async function streamChat(messages) {
  const response = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: messages,
      stream: true,
    }),
  });

  const reader = response.body.getReader();
  const decoder = new TextDecoder('utf-8');
  let buffer = '';

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split('\n');
    buffer = lines.pop();  // Keep any partial line for the next read

    for (const line of lines) {
      if (line.startsWith('data: ')) {
        const data = line.slice(6);
        if (data === '[DONE]') {
          return;
        }
        const chunk = JSON.parse(data);
        const delta = chunk.choices[0].delta;
        if (delta.content) {
          process.stdout.write(delta.content);
        }
      }
    }
  }
}

streamChat([
  { role: 'user', content: 'Write a poem about AI' }
]);
Limitations
Function Calling Not Supported
Streaming does not support function calling:
import requests

# This will return HTTP 400
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "functions": [{"name": "test", "parameters": {}}]  # Not allowed
    }
)
Error response:
{
  "detail": "Invalid request: Function calling is not yet implemented for stream mode."
}
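If your application mixes tool use with streaming, a simple client-side workaround (a sketch, not part of the server API) is to fall back to a non-streaming request whenever functions are present:

def create_chat_request(messages, functions=None):
    """Build a request body, disabling streaming when functions are used."""
    body = {"model": "gpt-3.5-turbo", "messages": messages}
    if functions:
        # Function calling is not supported in stream mode, so fall back
        # to a regular (non-streaming) completion for these requests.
        body["functions"] = functions
        body["stream"] = False
    else:
        body["stream"] = True
    return body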
Best Practices
Buffer Management
SSE events can arrive split across network reads, so buffer partial lines before parsing:
def stream_with_buffer(response):
    buffer = ""
    for chunk in response.iter_content(decode_unicode=True):
        buffer += chunk
        while '\n' in buffer:
            line, buffer = buffer.split('\n', 1)
            if line.startswith('data: '):
                process_line(line)  # process_line is a user-supplied handler
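Here process_line is your own handler; a minimal hypothetical implementation might look like:

import json

def process_line(line):
    """Hypothetical handler: parse one 'data: ...' line and print the delta."""
    data = line[6:].strip()
    if data == '[DONE]':
        return
    delta = json.loads(data)['choices'][0]['delta']
    if 'content' in delta:
        print(delta['content'], end='', flush=True)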
Error Handling
import requests
import json

def safe_stream(messages):
    try:
        response = requests.post(
            "http://localhost:8000/v1/chat/completions",
            json={
                "model": "gpt-3.5-turbo",
                "messages": messages,
                "stream": True
            },
            stream=True,
            timeout=30
        )
        response.raise_for_status()
        for line in response.iter_lines():
            if line:
                line = line.decode('utf-8')
                if line.startswith('data: '):
                    data = line[6:]
                    if data == '[DONE]':
                        break
                    try:
                        chunk = json.loads(data)
                        delta = chunk['choices'][0]['delta']
                        if 'content' in delta:
                            yield delta['content']
                    except json.JSONDecodeError:
                        print(f"Warning: Invalid JSON: {data}")
                        continue
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return

for content in safe_stream([{"role": "user", "content": "Hello"}]):
    print(content, end='', flush=True)
Stop Word Handling
The streaming implementation includes a delay buffer to handle stop words correctly: the last few tokens may be held back temporarily so they can be checked against stop sequences before being yielded.
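To illustrate the idea (a standalone sketch of the same technique, not the server's actual code), a delay buffer withholds enough trailing text to cover the longest stop sequence:

def delay_buffer_stream(tokens, stop_words):
    """Yield text incrementally, holding back enough of the tail to
    detect stop sequences before they are emitted."""
    hold = max(len(s) for s in stop_words)  # longest possible stop match
    buffer = ""
    for token in tokens:
        buffer += token
        for stop in stop_words:
            idx = buffer.find(stop)
            if idx != -1:
                yield buffer[:idx]  # emit only the text before the stop word
                return
        if len(buffer) > hold:
            # Safe to emit everything except the held-back tail
            yield buffer[:-hold]
            buffer = buffer[-hold:]
    yield buffer  # stream ended without hitting a stop word

# Example: the stop word "stop" never reaches the consumer
for piece in delay_buffer_stream(["Hel", "lo w", "orld", "stop", "!!"], ["stop"]):
    print(piece, end='')
print()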