Quickstart Guide
Walk a tiny namespace through the core loop: connect, write rows with vectors and attributes, query them, and run simple aggregations. Then layer on conditional writes and branching.
If you are an agent, you may wish to read the full documentation in Markdown.
Connect
- Install an SDK:
  # https://github.com/turbopuffer/turbopuffer-python
  pip install turbopuffer
- Create an API key from the Dashboard. The snippets default to the
  gcp-us-central1 region; change it to your preferred region if needed.
- Choose an embedding model. Pick one from the dropdown in the code sample below, or use random vectors to get started (don't use random vectors in production or for benchmarking).
# $ pip install turbopuffer sentence-transformers
import os
import uuid
from typing import List
import turbopuffer
from sentence_transformers import SentenceTransformer
# API client; the key is read from the TURBOPUFFER_API_KEY environment variable.
tpuf = turbopuffer.Turbopuffer(
    api_key=os.getenv("TURBOPUFFER_API_KEY"),  # created here: https://turbopuffer.com/dashboard
    region="gcp-us-central1",  # choose best region: https://turbopuffer.com/docs/regions
)

# Use TURBOPUFFER_NAMESPACE when set; otherwise fall back to a generated
# "quickstart-<8 hex chars>" name.
namespace = os.getenv("TURBOPUFFER_NAMESPACE", f"quickstart-{uuid.uuid4().hex[:8]}")
ns = tpuf.namespace(namespace)

# Local embeddings with BGE — no API key needed.
# Model is downloaded on first run (~130 MB).
bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
def embed(text: str) -> List[float]:
    """Embed `text` with the local BGE model and return the vector as a list of floats."""
    # .tolist() converts the model output into a plain Python list for the write/query payloads.
    return bge.encode(text).tolist()

Write
Upsert documents with vectors, typed attributes, and
full-text search on text and category (with
regex on text).
# Upsert two rows. The schema enables full-text search on `text` and
# `category`, and regex filtering on `text`.
ns.write(
    upsert_rows=[
        {
            'id': 1,
            'vector': embed("walrus narwhal"),
            'category': ["mammal"],
            'public': True,
            'text': "walrus narwhal",
        },
        {
            'id': 2,
            'vector': embed("pufferfish clownfish swordfish"),
            'category': ["fish"],
            'public': False,
            'text': "pufferfish clownfish swordfish",
        },
    ],
    distance_metric='cosine_distance',
    schema={
        "text": {
            "type": "string",
            "full_text_search": True,
            "regex": True,
        },
        "category": {
            "type": "[]string",
            "full_text_search": True,
        },
    },
)

Search
Find documents by vector similarity with filters,
by full-text search with a
boosted category field, or by
regex (\w+fish matches "pufferfish",
"swordfish", "clownfish"). To combine vector and FTS concurrently, see
hybrid search.
# Vector search with a filter: rank by ANN similarity to the query embedding,
# restricted to rows where `public` equals True.
print(ns.query(
    rank_by=("vector", "ANN", embed("arctic sea mammal")),
    limit=10,
    filters=("public", "Eq", True),
))
# Full-text search with boosted category field:
# rank = 2 * BM25(category, "mammal") + BM25(text, "quick walrus"),
# restricted to rows where `public` equals True.
print(ns.query(
    limit=10,
    filters=("public", "Eq", True),
    rank_by=("Sum", [
        ("Product", 2, ("category", "BM25", "mammal")),
        ("text", "BM25", "quick walrus"),
    ]),
))
# Regex filter — matches "pufferfish", "swordfish", "clownfish"
# A raw string keeps the pattern readable; it is the same value as "\\w+fish".
print(ns.query(
    limit=10,
    filters=("text", "Regex", r"\w+fish"),
))

Aggregate
Count documents without returning rows, and use
grouped aggregations to split the counts by attribute.
Stay in the same namespace and count rows per category.
grouped = ns.query(
aggregate_by={"count_by_category": ("Count",)},
group_by=["category"],
)
print(grouped.aggregation_groups)
# [Row(category=['fish'], count_by_category=1), Row(category=['mammal'], count_by_category=1)]Full runnable example
Prefer one copy-paste program for the core loop? This version covers connect, write, search, and aggregate in one file. Then continue below with the smaller conditional-write and branching snippets.
# $ pip install turbopuffer sentence-transformers
# Sample Python notebook:
# https://colab.research.google.com/drive/17i4sfFTeJQkINCxjBaOGOZeENZr4ZaTE
import os
import uuid
from typing import List
import turbopuffer
from sentence_transformers import SentenceTransformer
# API client; the key is read from the TURBOPUFFER_API_KEY environment variable.
tpuf = turbopuffer.Turbopuffer(
    api_key=os.getenv("TURBOPUFFER_API_KEY"),  # created here: https://turbopuffer.com/dashboard
    region="gcp-us-central1",  # choose best region: https://turbopuffer.com/docs/regions
)

# Use TURBOPUFFER_NAMESPACE when set; otherwise fall back to a generated
# "quickstart-<8 hex chars>" name.
namespace = os.getenv("TURBOPUFFER_NAMESPACE", f"quickstart-{uuid.uuid4().hex[:8]}")
ns = tpuf.namespace(namespace)

# Local embeddings with BGE — no API key needed.
# Model is downloaded on first run (~130 MB).
bge = SentenceTransformer("BAAI/bge-small-en-v1.5")
def embed(text: str) -> List[float]:
    """Embed `text` with the local BGE model and return the vector as a list of floats."""
    # .tolist() converts the model output into a plain Python list for the write/query payloads.
    return bge.encode(text).tolist()
# Upsert documents with vectors and attributes
ns.write(
    upsert_rows=[
        {
            'id': 1,
            'vector': embed("walrus narwhal"),
            'category': ["mammal"],
            'public': True,
            'text': "walrus narwhal",
        },
        {
            'id': 2,
            'vector': embed("pufferfish clownfish swordfish"),
            'category': ["fish"],
            'public': False,
            'text': "pufferfish clownfish swordfish",
        },
    ],
    distance_metric='cosine_distance',
    schema={
        "text": {
            # Configure FTS/BM25. Other attributes get inferred types (`public`: bool).
            "type": "string",
            # More schema & FTS options:
            # https://turbopuffer.com/docs/write#schema
            "full_text_search": True,
            "regex": True,
        },
        "category": {
            "type": "[]string",
            "full_text_search": True,
        },
    },
)
# Query nearest neighbors with a filter: rank by ANN similarity to the query
# embedding, restricted to rows where `public` equals True.
print(ns.query(
    rank_by=("vector", "ANN", embed("arctic sea mammal")),
    limit=10,
    filters=("public", "Eq", True),
))
# [Row(id=1, vector=None, $dist=0.42773545)]
# Full-text search on an attribute:
# rank = 2 * BM25(category, "mammal") + BM25(text, "quick walrus"),
# restricted to rows where `public` equals True.
# To combine FTS and vector search concurrently, see:
# https://turbopuffer.com/docs/hybrid-search
print(ns.query(
    limit=10,
    filters=("public", "Eq", True),
    rank_by=("Sum", [
        ("Product", 2, ("category", "BM25", "mammal")),
        ("text", "BM25", "quick walrus"),
    ]),
))
# [Row(id=1, vector=None, $dist=0.7549128)]
# Regex filter — matches "pufferfish", "swordfish", "clownfish"
# A raw string keeps the pattern readable; it is the same value as "\\w+fish".
print(ns.query(
    limit=10,
    filters=("text", "Regex", r"\w+fish"),
))
# Count documents grouped by category
grouped_result = ns.query(
aggregate_by={"count_by_category": ("Count",)},
group_by=["category"],
)
print(grouped_result.aggregation_groups)
# [Row(category=['fish'], count_by_category=1), Row(category=['mammal'], count_by_category=1)]Conditional writes
Only update a document when a condition is met -- for example,
keep only the newest timestamped write.
Continue from the same namespace and only apply the write when the new
updated_at is newer than the stored one, or when the row has no timestamp yet.
# Only update if this write has a newer timestamp
result = ns.write(
    upsert_rows=[{
        'id': 1,
        'vector': embed("updated walrus"),
        'category': ["mammal"],
        'updated_at': "2024-04-16T09:27:32Z",
    }],
    upsert_condition=(
        'Or', [
            # The stored `updated_at` is older than the incoming row's `updated_at` ...
            ('updated_at', 'Lt', {'$ref_new': 'updated_at'}),
            # ... or the stored row has no timestamp yet.
            ('updated_at', 'Eq', None),
        ]
    ),
    distance_metric='cosine_distance',
)
print(result.rows_affected)  # 1

Branching
Instantly clone a namespace with copy-on-write. Use it to spin up isolated test environments, keep lightweight versioned copies, or take snapshots before risky changes. Constant-time regardless of size, and fully independent after creation. Finally, branch the same namespace into a fresh copy and query it independently.
branch_namespace = f"{namespace}-branch"
branch = tpuf.namespace(branch_namespace)
branch.write(branch_from_namespace=namespace)
# Query the branch independently
print(branch.query(
rank_by=("vector", "ANN", embed("sea creature")),
limit=5,
))What's next
- Write docs -- schema, patches, deletes, delete-by-filter
- Query docs -- kNN, hybrid search, ordering, grouped aggregations
- Concepts -- namespaces, attributes, distance metrics
- Architecture -- how object storage makes this work