Ollama REST API (requests)¶
This notebook demonstrates the Ollama REST API using the requests library.
Features Covered¶
- List models
- Show model details
- List running models
- Generate response
- Chat completion
- Streaming responses
- Generate embeddings
- Copy and delete models
Prerequisites¶
- Ollama pod running: ujust ollama start
- Model pulled: ujust ollama pull llama3.2
1. Setup & Configuration¶
In [22]:
import os
import requests
import json
import time
# === Configuration ===
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://ollama:11434")
DEFAULT_MODEL = "llama3.2:latest"
print(f"Ollama host: {OLLAMA_HOST}")
print(f"Default model: {DEFAULT_MODEL}")
Out[22]:
Ollama host: http://ollama:11434
Default model: llama3.2:latest
2. Connection Health Check¶
In [23]:
def check_ollama_health() -> tuple[bool, bool]:
"""Check if Ollama server is running and model is available.
Returns:
tuple: (server_healthy, model_available)
"""
try:
response = requests.get(f"{OLLAMA_HOST}/api/tags", timeout=5)
if response.status_code == 200:
print("✓ Ollama server is running!")
models = response.json()
model_names = [m.get("name", "") for m in models.get("models", [])]
if DEFAULT_MODEL in model_names:
print(f"✓ Model '{DEFAULT_MODEL}' is available")
return True, True
else:
print(f"✗ Model '{DEFAULT_MODEL}' not found!")
print()
if model_names:
print("Available models:")
for name in model_names:
print(f" - {name}")
else:
print("No models installed.")
print()
print("To fix this, run:")
print(f" ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
return True, False
else:
print(f"Ollama returned unexpected status: {response.status_code}")
return False, False
except requests.exceptions.ConnectionError:
print("✗ Cannot connect to Ollama server!")
print("To fix this, run: ujust ollama start")
return False, False
except requests.exceptions.Timeout:
print("✗ Connection to Ollama timed out!")
return False, False
ollama_healthy, model_available = check_ollama_health()
Out[23]:
✓ Ollama server is running!
✓ Model 'llama3.2:latest' is available
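The tags endpoint doubles as the health probe here, but Ollama also exposes a version endpoint that makes an even lighter liveness check. A minimal sketch, assuming GET /api/version is available on your server (not executed above):

# Lightweight liveness probe (assumes the /api/version endpoint is available)
version_resp = requests.get(f"{OLLAMA_HOST}/api/version", timeout=5)
print(version_resp.json())  # e.g. {"version": "..."}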
3. List Models¶
Endpoint: GET /api/tags
In [24]:
print("=== List Available Models ===")
response = requests.get(f"{OLLAMA_HOST}/api/tags")
models_data = response.json()
if models_data.get("models"):
for model in models_data["models"]:
size_gb = model.get("size", 0) / (1024**3)
print(f" - {model['name']} ({size_gb:.2f} GB)")
else:
print(" No models found. Run: ujust ollama pull llama3.2")
print("=== List Available Models ===") response = requests.get(f"{OLLAMA_HOST}/api/tags") models_data = response.json() if models_data.get("models"): for model in models_data["models"]: size_gb = model.get("size", 0) / (1024**3) print(f" - {model['name']} ({size_gb:.2f} GB)") else: print(" No models found. Run: ujust ollama pull llama3.2")
Out[24]:
=== List Available Models ===
 - hf.co/NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF:Q4_K_M (4.07 GB)
 - llama3.2:latest (1.88 GB)
4. Show Model Details¶
Endpoint: POST /api/show
In [25]:
print("=== Show Model Details ===")
if not model_available:
print()
print("⚠ Skipping - model not available")
print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
else:
response = requests.post(
f"{OLLAMA_HOST}/api/show",
json={"model": DEFAULT_MODEL}
)
model_info = response.json()
if "error" in model_info:
print(f"✗ Error: {model_info['error']}")
else:
print(f"Model: {DEFAULT_MODEL}")
print(f"\nDetails:")
if "details" in model_info:
details = model_info["details"]
print(f" Family: {details.get('family', 'N/A')}")
print(f" Parameter Size: {details.get('parameter_size', 'N/A')}")
print(f" Quantization: {details.get('quantization_level', 'N/A')}")
print(f"\nTemplate preview:")
template = model_info.get("template", "N/A")
print(f" {template[:200]}..." if len(template) > 200 else f" {template}")
print("=== Show Model Details ===") if not model_available: print() print("⚠ Skipping - model not available") print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}") else: response = requests.post( f"{OLLAMA_HOST}/api/show", json={"model": DEFAULT_MODEL} ) model_info = response.json() if "error" in model_info: print(f"✗ Error: {model_info['error']}") else: print(f"Model: {DEFAULT_MODEL}") print(f"\nDetails:") if "details" in model_info: details = model_info["details"] print(f" Family: {details.get('family', 'N/A')}") print(f" Parameter Size: {details.get('parameter_size', 'N/A')}") print(f" Quantization: {details.get('quantization_level', 'N/A')}") print(f"\nTemplate preview:") template = model_info.get("template", "N/A") print(f" {template[:200]}..." if len(template) > 200 else f" {template}")
Out[25]:
=== Show Model Details ===
Model: llama3.2:latest
Details:
Family: llama
Parameter Size: 3.2B
Quantization: Q4_K_M
Template preview:
<|start_header_id|>system<|end_header_id|>
Cutting Knowledge Date: December 2023
{{ if .System }}{{ .System }}
{{- end }}
{{- if .Tools }}When you receive a tool call response, use the output to for...
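The /api/show payload typically contains more than the fields printed above (for example the Modelfile and raw parameters). A small sketch for previewing everything the server returns; the exact key names vary by model and Ollama version:

# Preview every top-level key in the /api/show response
info = requests.post(f"{OLLAMA_HOST}/api/show", json={"model": DEFAULT_MODEL}).json()
for key, value in info.items():
    print(f"{key}: {str(value)[:60]}")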
5. List Running Models¶
Endpoint: GET /api/ps
In [26]:
print("=== List Running Models ===")
response = requests.get(f"{OLLAMA_HOST}/api/ps")
running = response.json()
if running.get("models"):
for model in running["models"]:
name = model.get("name", "Unknown")
size = model.get("size", 0) / (1024**3)
vram = model.get("size_vram", 0) / (1024**3)
expires = model.get("expires_at", "N/A")
print(f" - {name}")
print(f" Size: {size:.2f} GB | VRAM: {vram:.2f} GB")
print(f" Expires: {expires}")
else:
print(" No models currently loaded in memory")
print("=== List Running Models ===") response = requests.get(f"{OLLAMA_HOST}/api/ps") running = response.json() if running.get("models"): for model in running["models"]: name = model.get("name", "Unknown") size = model.get("size", 0) / (1024**3) vram = model.get("size_vram", 0) / (1024**3) expires = model.get("expires_at", "N/A") print(f" - {name}") print(f" Size: {size:.2f} GB | VRAM: {vram:.2f} GB") print(f" Expires: {expires}") else: print(" No models currently loaded in memory")
Out[26]:
=== List Running Models ===
- llama3.2:latest
Size: 2.56 GB | VRAM: 2.56 GB
Expires: 2025-12-28T20:29:33.116691371Z
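Loaded models are evicted automatically after an idle period (the Expires timestamp above). To free VRAM sooner, a generate request with keep_alive set to 0 asks Ollama to unload the model; a minimal sketch:

# Ask Ollama to unload the model right away by sending keep_alive=0
requests.post(
    f"{OLLAMA_HOST}/api/generate",
    json={"model": DEFAULT_MODEL, "keep_alive": 0},
)
# A follow-up GET /api/ps should then show no loaded models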
6. Generate Response¶
Endpoint: POST /api/generate
In [27]:
print("=== Generate Response ===")
if not model_available:
print()
print("⚠ Skipping - model not available")
print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
else:
prompt = "Why is the sky blue? Answer in one sentence."
print(f"Prompt: {prompt}")
print()
start_time = time.perf_counter()
response = requests.post(
f"{OLLAMA_HOST}/api/generate",
json={
"model": DEFAULT_MODEL,
"prompt": prompt,
"stream": False
}
)
end_time = time.perf_counter()
result = response.json()
if "error" in result:
print(f"✗ Error: {result['error']}")
else:
print(f"Response: {result['response']}")
print()
print(f"Latency: {end_time - start_time:.2f}s")
print(f"Eval tokens: {result.get('eval_count', 'N/A')}")
print(f"Eval duration: {result.get('eval_duration', 0) / 1e9:.2f}s")
print("=== Generate Response ===") if not model_available: print() print("⚠ Skipping - model not available") print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}") else: prompt = "Why is the sky blue? Answer in one sentence." print(f"Prompt: {prompt}") print() start_time = time.perf_counter() response = requests.post( f"{OLLAMA_HOST}/api/generate", json={ "model": DEFAULT_MODEL, "prompt": prompt, "stream": False } ) end_time = time.perf_counter() result = response.json() if "error" in result: print(f"✗ Error: {result['error']}") else: print(f"Response: {result['response']}") print() print(f"Latency: {end_time - start_time:.2f}s") print(f"Eval tokens: {result.get('eval_count', 'N/A')}") print(f"Eval duration: {result.get('eval_duration', 0) / 1e9:.2f}s")
Out[27]:
=== Generate Response ===
Prompt: Why is the sky blue? Answer in one sentence.

Response: The sky appears blue because of a phenomenon called Rayleigh scattering, where shorter wavelengths of light (such as blue and violet) are scattered more than longer wavelengths by the tiny molecules of gases in the Earth's atmosphere.

Latency: 3.33s
Eval tokens: 44
Eval duration: 0.17s
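/api/generate also accepts an options object for sampling parameters. A short sketch with a lower temperature, a token cap, and a fixed seed; the values are illustrative, not recommendations:

# Generate with explicit sampling options (values are illustrative)
response = requests.post(
    f"{OLLAMA_HOST}/api/generate",
    json={
        "model": DEFAULT_MODEL,
        "prompt": "Name three primary colors.",
        "stream": False,
        "options": {"temperature": 0.2, "num_predict": 64, "seed": 42},
    },
)
print(response.json().get("response", ""))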
7. Chat Completion¶
Endpoint: POST /api/chat
In [28]:
print("=== Chat Completion ===")
if not model_available:
print()
print("⚠ Skipping - model not available")
print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
else:
messages = [
{"role": "system", "content": "You are a helpful assistant. Keep responses brief."},
{"role": "user", "content": "What is Python?"}
]
response = requests.post(
f"{OLLAMA_HOST}/api/chat",
json={
"model": DEFAULT_MODEL,
"messages": messages,
"stream": False
}
)
result = response.json()
if "error" in result:
print(f"✗ Error: {result['error']}")
else:
print(f"Assistant: {result['message']['content']}")
print("=== Chat Completion ===") if not model_available: print() print("⚠ Skipping - model not available") print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}") else: messages = [ {"role": "system", "content": "You are a helpful assistant. Keep responses brief."}, {"role": "user", "content": "What is Python?"} ] response = requests.post( f"{OLLAMA_HOST}/api/chat", json={ "model": DEFAULT_MODEL, "messages": messages, "stream": False } ) result = response.json() if "error" in result: print(f"✗ Error: {result['error']}") else: print(f"Assistant: {result['message']['content']}")
Out[28]:
=== Chat Completion ===
Assistant: Python is a high-level, interpreted programming language that's easy to learn and use. It's known for its simplicity, readability, and versatility, making it a popular choice for web development, data analysis, artificial intelligence, and more.
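The chat endpoint is stateless, so multi-turn conversations replay the full history on every call, including the assistant's previous reply. A minimal sketch that continues the exchange above, assuming the previous cell ran successfully and left messages and result defined:

# Continue the conversation: append the assistant reply plus a new user turn
messages.append(result["message"])
messages.append({"role": "user", "content": "Show a one-line example of Python code."})
follow_up = requests.post(
    f"{OLLAMA_HOST}/api/chat",
    json={"model": DEFAULT_MODEL, "messages": messages, "stream": False},
)
print(follow_up.json()["message"]["content"])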
8. Streaming Response¶
Endpoint: POST /api/generate with stream: true
In [29]:
print("=== Streaming Response ===")
if not model_available:
print()
print("⚠ Skipping - model not available")
print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
else:
print()
response = requests.post(
f"{OLLAMA_HOST}/api/generate",
json={
"model": DEFAULT_MODEL,
"prompt": "Count from 1 to 5.",
"stream": True
},
stream=True
)
collected = []
for line in response.iter_lines():
if line:
chunk = json.loads(line)
if "error" in chunk:
print(f"✗ Error: {chunk['error']}")
break
token = chunk.get("response", "")
collected.append(token)
if chunk.get("done"):
break
if collected:
print(f"Response: {''.join(collected)}")
print("=== Streaming Response ===") if not model_available: print() print("⚠ Skipping - model not available") print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}") else: print() response = requests.post( f"{OLLAMA_HOST}/api/generate", json={ "model": DEFAULT_MODEL, "prompt": "Count from 1 to 5.", "stream": True }, stream=True ) collected = [] for line in response.iter_lines(): if line: chunk = json.loads(line) if "error" in chunk: print(f"✗ Error: {chunk['error']}") break token = chunk.get("response", "") collected.append(token) if chunk.get("done"): break if collected: print(f"Response: {''.join(collected)}")
Out[29]:
=== Streaming Response ===

Response: Here we go: 1 2 3 4 5
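/api/chat streams the same way: each line is a JSON chunk whose message.content carries the next piece of text, with done set on the final chunk. A minimal sketch, assuming the model is available:

# Stream a chat completion and print tokens as they arrive
with requests.post(
    f"{OLLAMA_HOST}/api/chat",
    json={
        "model": DEFAULT_MODEL,
        "messages": [{"role": "user", "content": "Say hello in three languages."}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line:
            continue
        chunk = json.loads(line)
        print(chunk.get("message", {}).get("content", ""), end="", flush=True)
        if chunk.get("done"):
            print()
            break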
9. Generate Embeddings¶
Endpoint: POST /api/embed
In [30]:
print("=== Generate Embeddings ===")
if not model_available:
print()
print("⚠ Skipping - model not available")
print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
else:
test_text = "Ollama makes running LLMs locally easy and efficient."
response = requests.post(
f"{OLLAMA_HOST}/api/embed",
json={
"model": DEFAULT_MODEL,
"input": test_text
}
)
result = response.json()
if "error" in result:
print(f"✗ Error: {result['error']}")
else:
embeddings = result.get("embeddings", [[]])[0]
print(f"Input: '{test_text}'")
print(f"Embedding dimensions: {len(embeddings)}")
print(f"First 5 values: {embeddings[:5]}")
print(f"Last 5 values: {embeddings[-5:]}")
print("=== Generate Embeddings ===") if not model_available: print() print("⚠ Skipping - model not available") print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}") else: test_text = "Ollama makes running LLMs locally easy and efficient." response = requests.post( f"{OLLAMA_HOST}/api/embed", json={ "model": DEFAULT_MODEL, "input": test_text } ) result = response.json() if "error" in result: print(f"✗ Error: {result['error']}") else: embeddings = result.get("embeddings", [[]])[0] print(f"Input: '{test_text}'") print(f"Embedding dimensions: {len(embeddings)}") print(f"First 5 values: {embeddings[:5]}") print(f"Last 5 values: {embeddings[-5:]}")
Out[30]:
=== Generate Embeddings ===
Input: 'Ollama makes running LLMs locally easy and efficient.'
Embedding dimensions: 3072
First 5 values: [-0.026683128, -0.0028091324, -0.027384995, -0.009667068, -0.017405545]
Last 5 values: [-0.028065814, 0.010568945, -0.028453464, 0.014874469, -0.029712567]
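/api/embed also accepts a list of inputs, which makes simple similarity comparisons easy. A small sketch computing cosine similarity with only the standard library; the example texts are illustrative:

import math

# Embed two texts in one request and compare them with cosine similarity
texts = ["Ollama runs models locally.", "The weather is sunny today."]
resp = requests.post(f"{OLLAMA_HOST}/api/embed", json={"model": DEFAULT_MODEL, "input": texts})
vec_a, vec_b = resp.json()["embeddings"]

dot = sum(a * b for a, b in zip(vec_a, vec_b))
norm_a = math.sqrt(sum(a * a for a in vec_a))
norm_b = math.sqrt(sum(b * b for b in vec_b))
print(f"Cosine similarity: {dot / (norm_a * norm_b):.4f}")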
10. Copy and Delete Model¶
Endpoints: POST /api/copy, DELETE /api/delete
Warning: deletion is permanent! This cell demonstrates it safely by creating a copy first and deleting only the copy.
In [31]:
print("=== Copy and Delete Model ===")
if not model_available:
print()
print("⚠ Skipping - model not available")
print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}")
else:
COPY_NAME = f"{DEFAULT_MODEL.split(':')[0]}-test-copy:latest"
# Step 1: Copy the model
print(f"\n1. Copying '{DEFAULT_MODEL}' to '{COPY_NAME}'...")
response = requests.post(
f"{OLLAMA_HOST}/api/copy",
json={
"source": DEFAULT_MODEL,
"destination": COPY_NAME
}
)
if response.status_code == 200:
print(f" Copy successful!")
else:
print(f" Copy failed: {response.text}")
# Step 2: Verify the copy exists
print(f"\n2. Verifying '{COPY_NAME}' exists...")
response = requests.get(f"{OLLAMA_HOST}/api/tags")
models = [m["name"] for m in response.json().get("models", [])]
if COPY_NAME in models:
print(f" Found '{COPY_NAME}' in model list")
else:
print(f" '{COPY_NAME}' not found")
# Step 3: Delete the copy
print(f"\n3. Deleting '{COPY_NAME}'...")
response = requests.delete(
f"{OLLAMA_HOST}/api/delete",
json={"model": COPY_NAME}
)
if response.status_code == 200:
print(f" Delete successful!")
else:
print(f" Delete failed: {response.text}")
# Step 4: Verify deletion
print(f"\n4. Verifying '{COPY_NAME}' is deleted...")
response = requests.get(f"{OLLAMA_HOST}/api/tags")
models = [m["name"] for m in response.json().get("models", [])]
if COPY_NAME not in models:
print(f" '{COPY_NAME}' successfully removed")
else:
print(f" '{COPY_NAME}' still exists")
print("=== Copy and Delete Model ===") if not model_available: print() print("⚠ Skipping - model not available") print(f" Run: ujust ollama pull {DEFAULT_MODEL.split(':')[0]}") else: COPY_NAME = f"{DEFAULT_MODEL.split(':')[0]}-test-copy:latest" # Step 1: Copy the model print(f"\n1. Copying '{DEFAULT_MODEL}' to '{COPY_NAME}'...") response = requests.post( f"{OLLAMA_HOST}/api/copy", json={ "source": DEFAULT_MODEL, "destination": COPY_NAME } ) if response.status_code == 200: print(f" Copy successful!") else: print(f" Copy failed: {response.text}") # Step 2: Verify the copy exists print(f"\n2. Verifying '{COPY_NAME}' exists...") response = requests.get(f"{OLLAMA_HOST}/api/tags") models = [m["name"] for m in response.json().get("models", [])] if COPY_NAME in models: print(f" Found '{COPY_NAME}' in model list") else: print(f" '{COPY_NAME}' not found") # Step 3: Delete the copy print(f"\n3. Deleting '{COPY_NAME}'...") response = requests.delete( f"{OLLAMA_HOST}/api/delete", json={"model": COPY_NAME} ) if response.status_code == 200: print(f" Delete successful!") else: print(f" Delete failed: {response.text}") # Step 4: Verify deletion print(f"\n4. Verifying '{COPY_NAME}' is deleted...") response = requests.get(f"{OLLAMA_HOST}/api/tags") models = [m["name"] for m in response.json().get("models", [])] if COPY_NAME not in models: print(f" '{COPY_NAME}' successfully removed") else: print(f" '{COPY_NAME}' still exists")
Out[31]:
=== Copy and Delete Model ===

1. Copying 'llama3.2:latest' to 'llama3.2-test-copy:latest'...
 Copy successful!

2. Verifying 'llama3.2-test-copy:latest' exists...
 Found 'llama3.2-test-copy:latest' in model list

3. Deleting 'llama3.2-test-copy:latest'...
 Delete successful!

4. Verifying 'llama3.2-test-copy:latest' is deleted...
 'llama3.2-test-copy:latest' successfully removed
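The natural companion to copy and delete is pull, which downloads a model through the API instead of ujust. A minimal, non-streaming sketch; note that pulls can be multi-gigabyte downloads, and the model name here is illustrative:

# Pull a model via the API (blocks until the download finishes)
resp = requests.post(
    f"{OLLAMA_HOST}/api/pull",
    json={"model": "llama3.2", "stream": False},
    timeout=600,  # large downloads can take a while
)
print(resp.json().get("status"))  # "success" once the pull completes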
11. Error Handling¶
In [32]:
print("=== Error Handling ===")
# Test: Non-existent model
print("\n1. Testing non-existent model...")
response = requests.post(
f"{OLLAMA_HOST}/api/generate",
json={
"model": "nonexistent-model-xyz",
"prompt": "Hello",
"stream": False
}
)
if response.status_code != 200:
print(f" Expected error: {response.status_code} - {response.text[:100]}")
else:
print(f" Unexpected success")
# Test: Empty prompt
print("\n2. Testing empty prompt...")
response = requests.post(
f"{OLLAMA_HOST}/api/generate",
json={
"model": DEFAULT_MODEL,
"prompt": "",
"stream": False
}
)
if response.status_code == 200:
print(f" Empty prompts allowed (returned response)")
else:
print(f" Error: {response.status_code}")
print("\nError handling tests completed!")
print("=== Error Handling ===") # Test: Non-existent model print("\n1. Testing non-existent model...") response = requests.post( f"{OLLAMA_HOST}/api/generate", json={ "model": "nonexistent-model-xyz", "prompt": "Hello", "stream": False } ) if response.status_code != 200: print(f" Expected error: {response.status_code} - {response.text[:100]}") else: print(f" Unexpected success") # Test: Empty prompt print("\n2. Testing empty prompt...") response = requests.post( f"{OLLAMA_HOST}/api/generate", json={ "model": DEFAULT_MODEL, "prompt": "", "stream": False } ) if response.status_code == 200: print(f" Empty prompts allowed (returned response)") else: print(f" Error: {response.status_code}") print("\nError handling tests completed!")
Out[32]:
=== Error Handling ===
1. Testing non-existent model...
Expected error: 404 - {"error":"model 'nonexistent-model-xyz' not found"}
2. Testing empty prompt...
 Empty prompts allowed (returned response)

Error handling tests completed!
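Outside a notebook it is worth wrapping these calls with an explicit timeout and a simple retry. A minimal sketch; the helper name and retry policy are our own, not part of the Ollama API:

# Hypothetical helper: POST to Ollama with a timeout and basic retries
def ollama_post(path: str, payload: dict, retries: int = 3, timeout: int = 60) -> dict:
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            resp = requests.post(f"{OLLAMA_HOST}{path}", json=payload, timeout=timeout)
            resp.raise_for_status()
            return resp.json()
        except requests.exceptions.RequestException as exc:
            last_error = exc
            time.sleep(2 ** attempt)  # back off: 2s, 4s, 8s
    raise RuntimeError(f"Request to {path} failed after {retries} attempts") from last_error

# Example usage:
# ollama_post("/api/generate", {"model": DEFAULT_MODEL, "prompt": "Hi", "stream": False})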
Summary¶
This notebook demonstrated the Ollama REST API using the requests library.
API Endpoints Used¶
| Endpoint | Method | Purpose |
|---|---|---|
| /api/tags | GET | List models |
| /api/show | POST | Show model details |
| /api/ps | GET | List running models |
| /api/generate | POST | Generate text |
| /api/chat | POST | Chat completion |
| /api/embed | POST | Generate embeddings |
| /api/copy | POST | Copy a model |
| /api/delete | DELETE | Delete a model |
Quick Reference¶
import requests
# Generate
requests.post("http://ollama:11434/api/generate", json={"model": "llama3.2", "prompt": "..."})
# Chat
requests.post("http://ollama:11434/api/chat", json={"model": "llama3.2", "messages": [...]})