Currently, the vector attribute is required even when you only use full-text search (FTS). We'll remove this requirement soon. Until then, if you only need FTS, set each document's vector to a short, random vector.
turbopuffer supports BM25 full-text search for string and []string types. This guide shows how to configure and use full-text search with different options.
turbopuffer's full-text search engine was written from the ground up for the turbopuffer storage engine, enabling low-latency searches directly on object storage.
For hybrid search combining both vector and BM25 results, see Hybrid Search.
For all available full-text search options, see the Schema documentation.
# $ pip install turbopuffer[fast]
import turbopuffer as tpuf
import os
import random
import uuid
# API tokens are created in the dashboard: https://turbopuffer.com/dashboard
tpuf.api_key = os.environ.get("TURBOPUFFER_API_KEY")
# Pick the right region: https://turbopuffer.com/docs/regions
tpuf.api_base_url = "https://gcp-us-central1.turbopuffer.com"
# The random UUID suffix gives every run its own namespace name.
ns = tpuf.Namespace("fts-py-" + f"{uuid.uuid4()}")
# Index three documents. Each row still gets a vector, so short random
# two-element vectors are supplied as placeholders.
ns.upsert(
    ids=[1, 2, 3],
    vectors=[[random.random(), random.random()] for _ in range(3)],
    attributes={
        "content": [
            "turbopuffer is a fast search engine with FTS, filtering, and vector search support",
            "turbopuffer can store billions and billions of documents cheaper than any other search engine",
            "turbopuffer will support many more types of queries as it evolves. turbopuffer will only get faster.",
        ],
    },
    distance_metric="cosine_distance",
    schema={
        "content": {
            "type": "string",
            # Enable BM25 with default settings.
            # For all config options, see https://turbopuffer.com/docs/schema
            "full_text_search": True,
        },
    },
)
# Basic FTS search. To combine it with vector search, see
# https://turbopuffer.com/docs/hybrid-search
results = ns.query(
    # Rank documents by the BM25 score of "turbopuffer" against `content`.
    rank_by=["content", "BM25", "turbopuffer"],
    top_k=10,
    include_attributes=["content"],
)
# [3, 1, 2] is the default BM25 ranking, based on document length and
# term frequency.
print(results)
# Advanced example: several attributes, each with its own FTS settings.
ns = tpuf.Namespace("fts-py-2")
ns.upsert(
    ids=[1, 2, 3],
    # Short random two-element placeholder vectors, one per document.
    vectors=[[random.random(), random.random()] for _ in range(3)],
    attributes={
        "title": [
            "Getting Started with Python",
            "Advanced TypeScript Tips",
            "Python vs JavaScript",
        ],
        "content": [
            "Learn Python basics including variables, functions, and classes",
            "Discover advanced TypeScript features and type system tricks",
            "Compare Python and JavaScript for web development",
        ],
        "tags": [
            ["python", "programming", "beginner"],
            ["typescript", "javascript", "advanced"],
            ["python", "javascript", "comparison"],
        ],
        "language": ["en", "en", "en"],
        "publish_date": [1709251200, 1709337600, 1709424000],
    },
    distance_metric="cosine_distance",
    schema={
        # See all FTS indexing options at https://turbopuffer.com/docs/schema
        "title": {
            "type": "string",
            "full_text_search": {
                "language": "english",
                "stemming": True,
                "remove_stopwords": True,
                "case_sensitive": False,
            },
        },
        "content": {
            "type": "string",
            "full_text_search": {
                "language": "english",
                "stemming": True,
                "remove_stopwords": True,
            },
        },
        # Tags: stemming and stopword removal are switched off and
        # case-sensitive matching is switched on.
        "tags": {
            "type": "[]string",
            "full_text_search": {
                "stemming": False,
                "remove_stopwords": False,
                "case_sensitive": True,
            },
        },
    },
)
# Advanced FTS search. To combine it with vector search, see
# https://turbopuffer.com/docs/hybrid-search
results = ns.query(
    # See all FTS query options at https://turbopuffer.com/docs/query
    # One BM25 clause per searchable attribute, combined under "Sum".
    rank_by=[
        "Sum",
        [[field, "BM25", "python beginner"] for field in ("title", "content", "tags")],
    ],
    # Both conditions are wrapped in a single "And" filter.
    filters=[
        "And",
        [
            ["publish_date", "Gte", 1709251200],
            ["language", "Eq", "en"],
        ],
    ],
    top_k=10,
    include_attributes=["title", "content", "tags"],
)
print(results)