Quickstart Guide

Walk a tiny namespace through the core loop: connect, write rows with vectors and attributes, query them, and run simple aggregations; then layer on conditional writes and branching.

If you are an agent, you may wish to read the full documentation in Markdown.

Connect

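  1. Install an SDK. For Python: pip install turbopuffer sentence-transformers (shown at the top of the code sample below).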

  2. Create an API key from the Dashboard. The snippets default to gcp-us-central1; change it to your preferred region if needed.

  3. Choose an embedding model. Pick one from the dropdown in the code sample below, or use random vectors to start (don't use random vectors in production or for benchmarking).

# $ pip install turbopuffer sentence-transformers
import os
import uuid
from typing import List
import turbopuffer
from sentence_transformers import SentenceTransformer

# Build the client. The API key is read from the TURBOPUFFER_API_KEY
# environment variable (create one at https://turbopuffer.com/dashboard).
tpuf = turbopuffer.Turbopuffer(
    region="gcp-us-central1",  # choose best region: https://turbopuffer.com/docs/regions
    api_key=os.getenv("TURBOPUFFER_API_KEY"),
)

# Use TURBOPUFFER_NAMESPACE when it is set; otherwise fall back to a
# randomly suffixed quickstart namespace name.
default_namespace = f"quickstart-{uuid.uuid4().hex[:8]}"
namespace = os.getenv("TURBOPUFFER_NAMESPACE", default_namespace)
ns = tpuf.namespace(namespace)

# Local embeddings with BGE — computed on this machine, so no API key is needed.
# The model (~130 MB) is downloaded the first time this runs.
bge = SentenceTransformer("BAAI/bge-small-en-v1.5")


def embed(text: str) -> List[float]:
    """Encode ``text`` with the BGE model and return the result as a plain list."""
    embedding = bge.encode(text)
    return embedding.tolist()

Write

Upsert documents with vectors, typed attributes, and full-text search on text and category (with regex on text).

# Two sample rows: each carries its embedding plus the attributes used by
# the filters and full-text queries later in this guide.
documents = [
    {
        "id": 1,
        "vector": embed("walrus narwhal"),
        "category": ["mammal"],
        "public": True,
        "text": "walrus narwhal",
    },
    {
        "id": 2,
        "vector": embed("pufferfish clownfish swordfish"),
        "category": ["fish"],
        "public": False,
        "text": "pufferfish clownfish swordfish",
    },
]

# Explicit schema entries: full-text search on `text` and `category`,
# plus regex matching on `text`.
attribute_schema = {
    "text": {
        "type": "string",
        "full_text_search": True,
        "regex": True,
    },
    "category": {
        "type": "[]string",
        "full_text_search": True,
    },
}

ns.write(
    upsert_rows=documents,
    distance_metric="cosine_distance",
    schema=attribute_schema,
)

Find documents by vector similarity with filters, by full-text search with a boosted category field, or by regex (\w+fish matches "pufferfish", "swordfish", "clownfish"). To combine vector and FTS concurrently, see hybrid search.

# Vector search with a filter: rank by ANN against the query embedding,
# keeping only rows where `public` equals True.
vector_hits = ns.query(
    rank_by=("vector", "ANN", embed("arctic sea mammal")),
    filters=("public", "Eq", True),
    limit=10,
)
print(vector_hits)

# Full-text search with boosted category field: the BM25 score on
# `category` is multiplied by 2 and summed with the BM25 score on `text`.
bm25_ranking = (
    "Sum",
    [
        ("Product", 2, ("category", "BM25", "mammal")),
        ("text", "BM25", "quick walrus"),
    ],
)
fts_hits = ns.query(
    rank_by=bm25_ranking,
    filters=("public", "Eq", True),
    limit=10,
)
print(fts_hits)

# Regex filter — matches "pufferfish", "swordfish", "clownfish"
# (raw string so the backslash in \w reaches the regex unchanged)
regex_hits = ns.query(
    filters=("text", "Regex", r"\w+fish"),
    limit=10,
)
print(regex_hits)

Aggregate

Count documents without returning rows, and use grouped aggregations to split the counts by attribute. Stay in the same namespace and count rows per category.

# Count rows per `category`; the count is reported under the
# `count_by_category` label.
count_spec = {"count_by_category": ("Count",)}
grouped = ns.query(group_by=["category"], aggregate_by=count_spec)
print(grouped.aggregation_groups)
# [Row(category=['fish'], count_by_category=1), Row(category=['mammal'], count_by_category=1)]

Full runnable example

Prefer one copy-paste program for the core loop? This version covers connect, write, search, and aggregate in one file. Then continue below with the smaller conditional-write and branching snippets.

# $ pip install turbopuffer sentence-transformers
# Sample Python notebook:
# https://colab.research.google.com/drive/17i4sfFTeJQkINCxjBaOGOZeENZr4ZaTE
import os
import uuid
from typing import List

import turbopuffer
from sentence_transformers import SentenceTransformer

# Build the client. The API key is read from the TURBOPUFFER_API_KEY
# environment variable (create one at https://turbopuffer.com/dashboard).
tpuf = turbopuffer.Turbopuffer(
    region="gcp-us-central1",  # choose best region: https://turbopuffer.com/docs/regions
    api_key=os.getenv("TURBOPUFFER_API_KEY"),
)

# Use TURBOPUFFER_NAMESPACE when it is set; otherwise fall back to a
# randomly suffixed quickstart namespace name.
default_namespace = f"quickstart-{uuid.uuid4().hex[:8]}"
namespace = os.getenv("TURBOPUFFER_NAMESPACE", default_namespace)
ns = tpuf.namespace(namespace)

# Local embeddings with BGE — computed on this machine, so no API key is needed.
# The model (~130 MB) is downloaded the first time this runs.
bge = SentenceTransformer("BAAI/bge-small-en-v1.5")


def embed(text: str) -> List[float]:
    """Encode ``text`` with the BGE model and return the result as a plain list."""
    embedding = bge.encode(text)
    return embedding.tolist()

# Upsert documents with vectors and attributes
documents = [
    {
        "id": 1,
        "vector": embed("walrus narwhal"),
        "category": ["mammal"],
        "public": True,
        "text": "walrus narwhal",
    },
    {
        "id": 2,
        "vector": embed("pufferfish clownfish swordfish"),
        "category": ["fish"],
        "public": False,
        "text": "pufferfish clownfish swordfish",
    },
]

attribute_schema = {
    "text": {
        # Configure FTS/BM25. Attributes not listed in this schema get
        # inferred types (e.g. `public` is written as a boolean above).
        "type": "string",
        # More schema & FTS options:
        # https://turbopuffer.com/docs/write#schema
        "full_text_search": True,
        "regex": True,
    },
    "category": {
        "type": "[]string",
        "full_text_search": True,
    },
}

ns.write(
    upsert_rows=documents,
    distance_metric="cosine_distance",
    schema=attribute_schema,
)

# Query nearest neighbors with a filter
vector_hits = ns.query(
    rank_by=("vector", "ANN", embed("arctic sea mammal")),
    filters=("public", "Eq", True),
    limit=10,
)
print(vector_hits)
# [Row(id=1, vector=None, $dist=0.42773545)]

# Full-text search on an attribute: the BM25 score on `category` is
# multiplied by 2 and summed with the BM25 score on `text`.
# To combine FTS and vector search concurrently, see:
# https://turbopuffer.com/docs/hybrid-search
bm25_ranking = (
    "Sum",
    [
        ("Product", 2, ("category", "BM25", "mammal")),
        ("text", "BM25", "quick walrus"),
    ],
)
fts_hits = ns.query(
    rank_by=bm25_ranking,
    filters=("public", "Eq", True),
    limit=10,
)
print(fts_hits)
# [Row(id=1, vector=None, $dist=0.7549128)]

# Regex filter — matches "pufferfish", "swordfish", "clownfish"
# (raw string so the backslash in \w reaches the regex unchanged)
regex_hits = ns.query(
    filters=("text", "Regex", r"\w+fish"),
    limit=10,
)
print(regex_hits)

# Count documents grouped by category; the count is reported under the
# `count_by_category` label.
count_spec = {"count_by_category": ("Count",)}
grouped_result = ns.query(group_by=["category"], aggregate_by=count_spec)
print(grouped_result.aggregation_groups)
# [Row(category=['fish'], count_by_category=1), Row(category=['mammal'], count_by_category=1)]

Conditional writes

Only update a document when a condition is met -- for example, keep only the newest timestamped write. Continue from the same namespace and only apply the write when the new updated_at is newer than the stored one, or when the row has no timestamp yet.

# Only update if this write has a newer timestamp
newer_or_unstamped = (
    "Or",
    [
        # the stored `updated_at` is less than the incoming row's `updated_at` ...
        ("updated_at", "Lt", {"$ref_new": "updated_at"}),
        # ... or the stored row has no `updated_at` yet
        ("updated_at", "Eq", None),
    ],
)

# NOTE(review): this row omits `text` and `public`. If an upsert replaces
# the whole stored row, those attributes would be dropped — confirm intended.
incoming_row = {
    "id": 1,
    "vector": embed("updated walrus"),
    "category": ["mammal"],
    "updated_at": "2024-04-16T09:27:32Z",
}

result = ns.write(
    upsert_rows=[incoming_row],
    upsert_condition=newer_or_unstamped,
    distance_metric="cosine_distance",
)
print(result.rows_affected)  # 1

Branching

Instantly clone a namespace with copy-on-write. Use it to spin up isolated test environments, keep lightweight versioned copies, or take snapshots before risky changes. Branching is constant-time regardless of namespace size, and the branch is fully independent after creation. Finally, branch the same namespace into a fresh copy and query it independently.

# Name the branch after the source namespace, then create it by writing
# with `branch_from_namespace` pointing at the source.
branch_namespace = namespace + "-branch"
branch = tpuf.namespace(branch_namespace)
branch.write(branch_from_namespace=namespace)

# Query the branch independently
branch_hits = branch.query(
    limit=5,
    rank_by=("vector", "ANN", embed("sea creature")),
)
print(branch_hits)

What's next

  • Write docs -- schema, patches, deletes, delete-by-filter
  • Query docs -- kNN, hybrid search, ordering, grouped aggregations
  • Concepts -- namespaces, attributes, distance metrics
  • Architecture -- how object storage makes this work