Skip to content

Instantly share code, notes, and snippets.

@wkalt
Created February 12, 2026 03:56
Show Gist options
  • Select an option

  • Save wkalt/420ef473a435d9a57ccb18b15903cb2c to your computer and use it in GitHub Desktop.

Select an option

Save wkalt/420ef473a435d9a57ccb18b15903cb2c to your computer and use it in GitHub Desktop.

Lance Bug Reproduction

git clone <this-gist-url> lance-repro
cd lance-repro
mkdir -p tests && mv repro.rs tests/
cargo test --release --test repro -- --nocapture
[package]
name = "lance-repro"
version = "0.1.0"
edition = "2021"
[dependencies]
arrow-array = "57"
arrow-schema = "57"
futures = "0.3"
lance = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-datafusion = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-index = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-linalg = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
tempfile = "3"
tokio = { version = "1", features = ["full"] }
[workspace]
//! Standalone reproduction test generated by vibecheck sequence testing.
//!
//! Sequence ID: 737129
//!
//! Sequence:
//! 0. Write(100 rows)
//! 1. CreateIndex(nullable_int, BTree)
//!
//! Error: Op 1: Invariant violation after CreateIndex(NullableInt, BTree): Invariant violated: Filtered scans match model
use arrow_array::{Int64Array, Int32Array, Float64Array, Float32Array, StringArray, BooleanArray, RecordBatch, RecordBatchIterator, FixedSizeListArray};
use arrow_array::builder::{ListBuilder, StringBuilder};
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use futures::TryStreamExt;
use lance::dataset::{WriteParams, WriteMode};
use lance::Dataset;
use lance_index::scalar::ScalarIndexParams;
use lance_index::{DatasetIndexExt, IndexType};
use std::collections::HashSet;
use std::sync::Arc;
use tempfile::tempdir;
/// Vector dimension for testing (32D keeps tests fast)
const VECTOR_DIM: i32 = 32;

/// A single row of test data (with ID assigned).
#[derive(Debug, Clone, PartialEq)]
struct TestRow {
    // Unique row identifier; also seeds the derived labels/vector columns.
    id: i64,
    int_col: i32,
    float_col: f64,
    string_col: String,
    category: String,
    bool_col: bool,
    // Nullable column used to exercise scalar-index behavior around NULLs.
    nullable_int: Option<i32>,
    // List column whose elements may be NULL; the list itself may be empty.
    labels_col: Vec<Option<String>>,
    // Deterministic embedding of length VECTOR_DIM.
    vector_col: Vec<f32>,
}

impl TestRow {
    /// Generate deterministic labels based on the row ID.
    ///
    /// The list shape (length, NULL elements, empty list) cycles with
    /// `id % 8`, and label values cycle through a fixed five-word alphabet.
    /// Uses `rem_euclid` so indices stay in `0..5` even for negative IDs —
    /// plain `%` yields a negative remainder there, which the `as usize`
    /// cast would wrap into an out-of-bounds (panicking) array index.
    fn generate_labels(id: i64) -> Vec<Option<String>> {
        let labels = ["alpha", "beta", "gamma", "delta", "epsilon"];
        // Safe cyclic pick: rem_euclid is always non-negative.
        let pick = |n: i64| labels[n.rem_euclid(5) as usize].to_string();
        match id.rem_euclid(8) {
            0 => vec![Some(pick(id))],
            1 => vec![Some(pick(id)), Some(pick(id + 1))],
            2 => vec![Some(pick(id)), None],
            3 => vec![None, Some(pick(id))],
            4 => vec![None],
            5 => vec![],
            6 => vec![Some(pick(id)), None, Some(pick(id + 2))],
            _ => vec![Some(pick(id)), Some(pick(id + 1)), Some(pick(id + 2))],
        }
    }

    /// Generate a deterministic vector based on the row ID.
    ///
    /// Component `i` is `sin(id * 0.1 + i * 0.3)`, giving a smooth,
    /// reproducible embedding of length `VECTOR_DIM`.
    fn generate_vector(id: i64) -> Vec<f32> {
        (0..VECTOR_DIM as usize)
            .map(|i| {
                let angle = (id as f32 * 0.1) + (i as f32 * 0.3);
                angle.sin()
            })
            .collect()
    }
}
/// Build the Arrow schema shared by every batch written in this repro:
/// scalar columns, a nullable Int32, a List<Utf8> with nullable elements,
/// and a FixedSizeList<Float32, VECTOR_DIM> vector column.
fn test_schema() -> Arc<ArrowSchema> {
    let label_item = Arc::new(ArrowField::new("item", DataType::Utf8, true));
    let vector_item = Arc::new(ArrowField::new("item", DataType::Float32, true));
    let fields = vec![
        ArrowField::new("id", DataType::Int64, false),
        ArrowField::new("int_col", DataType::Int32, false),
        ArrowField::new("float_col", DataType::Float64, false),
        ArrowField::new("string_col", DataType::Utf8, false),
        ArrowField::new("category", DataType::Utf8, false),
        ArrowField::new("bool_col", DataType::Boolean, false),
        ArrowField::new("nullable_int", DataType::Int32, true),
        ArrowField::new("labels_col", DataType::List(label_item), false),
        ArrowField::new(
            "vector_col",
            DataType::FixedSizeList(vector_item, VECTOR_DIM),
            false,
        ),
    ];
    Arc::new(ArrowSchema::new(fields))
}
/// Convert a slice of `TestRow`s into one Arrow `RecordBatch` whose column
/// order and types match `test_schema()`.
fn rows_to_batch(rows: &[TestRow]) -> RecordBatch {
    let schema = test_schema();

    // Scalar columns, collected directly from the rows.
    let ids: Int64Array = rows.iter().map(|r| r.id).collect();
    let ints: Int32Array = rows.iter().map(|r| r.int_col).collect();
    let floats: Float64Array = rows.iter().map(|r| r.float_col).collect();
    let strings = StringArray::from_iter_values(rows.iter().map(|r| r.string_col.as_str()));
    let categories = StringArray::from_iter_values(rows.iter().map(|r| r.category.as_str()));
    let bools: BooleanArray = rows.iter().map(|r| Some(r.bool_col)).collect();
    let nullable_ints: Int32Array = rows.iter().map(|r| r.nullable_int).collect();

    // labels_col: List<Utf8> with nullable elements. An empty labels vec
    // yields an empty — but non-null — list entry.
    let mut labels_builder = ListBuilder::new(StringBuilder::new());
    for row in rows {
        for label in &row.labels_col {
            if let Some(text) = label {
                labels_builder.values().append_value(text);
            } else {
                labels_builder.values().append_null();
            }
        }
        labels_builder.append(true);
    }
    let labels = labels_builder.finish();

    // vector_col: flatten every row's vector into one Float32 buffer and
    // wrap it as FixedSizeList<Float32, VECTOR_DIM>.
    let flat: Vec<f32> = rows
        .iter()
        .flat_map(|r| r.vector_col.iter().copied())
        .collect();
    let item = Arc::new(ArrowField::new("item", DataType::Float32, true));
    let vectors =
        FixedSizeListArray::try_new(item, VECTOR_DIM, Arc::new(Float32Array::from(flat)), None)
            .unwrap();

    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(ids),
            Arc::new(ints),
            Arc::new(floats),
            Arc::new(strings),
            Arc::new(categories),
            Arc::new(bools),
            Arc::new(nullable_ints),
            Arc::new(labels),
            Arc::new(vectors),
        ],
    )
    .unwrap()
}
/// Reproduction for sequence 737129: after creating a BTree scalar index on
/// `nullable_int`, a filtered scan `(nullable_int != 0) OR (nullable_int < 5)`
/// must return the same row IDs as a full scan filtered in memory. The test
/// panics with a diff of the two ID sets if the index-backed scan diverges.
#[tokio::test(flavor = "multi_thread")]
async fn test_reproduction() {
let dir = tempdir().unwrap();
let uri = dir.path().to_str().unwrap();
let schema = test_schema();
const SEQ_ID: usize = 737129;
// Create empty dataset first to match test harness behavior
let empty_batch = RecordBatch::new_empty(schema.clone());
let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone());
let mut dataset = Dataset::write(reader, uri, None).await.unwrap();
// Step 0: Write(100 rows)
// IDs and int values are derived from SEQ_ID (the "+ 0" is the step index
// within the generated sequence), so each sequence writes a disjoint key
// range. Every third row gets NULL in `nullable_int` to exercise the index.
let rows: Vec<TestRow> = (0..100)
.map(|i| {
let seq_idx = SEQ_ID * 100 + 0;
let id = (seq_idx * 10000 + 5000 + i) as i64;
let int_val = (seq_idx * 100 + i) as i32;
TestRow {
id,
int_col: int_val,
float_col: int_val as f64,
string_col: format!("small_{}", id),
category: ["A", "B", "C", "D", "E"][i % 5].to_string(),
bool_col: i % 2 == 0,
nullable_int: if i % 3 == 0 { None } else { Some(int_val) },
labels_col: TestRow::generate_labels(id),
vector_col: TestRow::generate_vector(id),
}
})
.collect();
let batch = rows_to_batch(&rows);
let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
dataset.append(reader, None).await.unwrap();
// Step 1: CreateIndex(nullable_int, BTree)
dataset.create_index(&["nullable_int"], IndexType::BTree, None, &ScalarIndexParams::default(), true).await.unwrap();
// Verify filtered scan: (nullable_int != 0) OR (nullable_int < 5)
// Expected: full scan + local filter (ground truth, no index)
use arrow_array::Array;
let all_batches: Vec<_> = dataset.scan().try_into_stream().await.unwrap()
.try_collect::<Vec<_>>().await.unwrap();
// Ground truth mirrors SQL three-valued logic: each disjunct requires the
// value to be non-NULL, so rows with NULL `nullable_int` are excluded —
// matching how the engine should evaluate the pushed-down filter.
let expected_ids: HashSet<i64> = all_batches.iter()
.flat_map(|b| {
let ids = b.column_by_name("id").unwrap().as_any().downcast_ref::<Int64Array>().unwrap();
let nullable_int = b.column_by_name("nullable_int").unwrap().as_any().downcast_ref::<Int32Array>().unwrap();
(0..b.num_rows()).filter_map(move |i| {
if ((!nullable_int.is_null(i) && nullable_int.value(i) != 0)) || ((!nullable_int.is_null(i) && nullable_int.value(i) < 5)) { Some(ids.value(i)) } else { None }
})
})
.collect();
// Actual: filtered scan (may use index)
let filtered: Vec<_> = dataset.scan().filter("(nullable_int != 0) OR (nullable_int < 5)").unwrap()
.try_into_stream().await.unwrap()
.try_collect::<Vec<_>>().await.unwrap();
let actual_ids: HashSet<i64> = filtered.iter()
.flat_map(|b| {
let ids = b.column_by_name("id").unwrap().as_any().downcast_ref::<Int64Array>().unwrap();
(0..b.num_rows()).map(move |i| ids.value(i))
})
.collect();
// Symmetric difference of the two ID sets; any entry on either side is a
// correctness violation in the index-backed scan.
let only_expected: Vec<_> = expected_ids.difference(&actual_ids).copied().collect();
let only_actual: Vec<_> = actual_ids.difference(&expected_ids).copied().collect();
if !only_expected.is_empty() || !only_actual.is_empty() {
eprintln!("Filtered scan mismatch for: (nullable_int != 0) OR (nullable_int < 5)");
eprintln!(" expected: {} ids, actual: {} ids", expected_ids.len(), actual_ids.len());
if !only_expected.is_empty() { eprintln!(" only in expected ({}): {:?}", only_expected.len(), only_expected); }
if !only_actual.is_empty() { eprintln!(" only in actual ({}): {:?}", only_actual.len(), only_actual); }
panic!("Filtered scan mismatch");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment