Reproduction for a Lance filtered-scan invariant violation: after writing 100 rows and creating a BTree index on the nullable `nullable_int` column, a filtered scan returns different rows than a full scan plus a local filter (sequence 737129). To run:

git clone <this-gist-url> lance-repro
cd lance-repro
mkdir -p tests && mv repro.rs tests/
cargo test --release --test repro -- --nocapture
Cargo.toml:

[package]
name = "lance-repro"
version = "0.1.0"
edition = "2021"

[dependencies]
arrow-array = "57"
arrow-schema = "57"
futures = "0.3"
lance = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-datafusion = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-index = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-linalg = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
tempfile = "3"
tokio = { version = "1", features = ["full"] }

[workspace]
repro.rs:

//! Standalone reproduction test generated by vibecheck sequence testing.
//!
//! Sequence ID: 737129
//!
//! Sequence:
//! 0. Write(100 rows)
//! 1. CreateIndex(nullable_int, BTree)
//!
//! Error: Op 1: Invariant violation after CreateIndex(NullableInt, BTree): Invariant violated: Filtered scans match model

use arrow_array::{Int64Array, Int32Array, Float64Array, Float32Array, StringArray, BooleanArray, RecordBatch, RecordBatchIterator, FixedSizeListArray};
use arrow_array::builder::{ListBuilder, StringBuilder};
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use futures::TryStreamExt;
use lance::dataset::{WriteParams, WriteMode};
use lance::Dataset;
use lance_index::scalar::ScalarIndexParams;
use lance_index::{DatasetIndexExt, IndexType};
use std::collections::HashSet;
use std::sync::Arc;
use tempfile::tempdir;

/// Vector dimension for testing (32D keeps tests fast)
const VECTOR_DIM: i32 = 32;

/// A single row of test data (with ID assigned).
#[derive(Debug, Clone, PartialEq)]
struct TestRow {
    id: i64,
    int_col: i32,
    float_col: f64,
    string_col: String,
    category: String,
    bool_col: bool,
    nullable_int: Option<i32>,
    labels_col: Vec<Option<String>>,
    vector_col: Vec<f32>,
}

impl TestRow {
    /// Generate deterministic labels based on the row ID.
    fn generate_labels(id: i64) -> Vec<Option<String>> {
        let labels = ["alpha", "beta", "gamma", "delta", "epsilon"];
        match id % 8 {
            0 => vec![Some(labels[(id % 5) as usize].to_string())],
            1 => vec![
                Some(labels[(id % 5) as usize].to_string()),
                Some(labels[((id + 1) % 5) as usize].to_string()),
            ],
            2 => vec![Some(labels[(id % 5) as usize].to_string()), None],
            3 => vec![None, Some(labels[(id % 5) as usize].to_string())],
            4 => vec![None],
            5 => vec![],
            6 => vec![
                Some(labels[(id % 5) as usize].to_string()),
                None,
                Some(labels[((id + 2) % 5) as usize].to_string()),
            ],
            _ => vec![
                Some(labels[(id % 5) as usize].to_string()),
                Some(labels[((id + 1) % 5) as usize].to_string()),
                Some(labels[((id + 2) % 5) as usize].to_string()),
            ],
        }
    }

    /// Generate a deterministic vector based on the row ID.
    fn generate_vector(id: i64) -> Vec<f32> {
        (0..VECTOR_DIM as usize)
            .map(|i| {
                let angle = (id as f32 * 0.1) + (i as f32 * 0.3);
                angle.sin()
            })
            .collect()
    }
}

/// Create the test schema.
fn test_schema() -> Arc<ArrowSchema> {
    Arc::new(ArrowSchema::new(vec![
        ArrowField::new("id", DataType::Int64, false),
        ArrowField::new("int_col", DataType::Int32, false),
        ArrowField::new("float_col", DataType::Float64, false),
        ArrowField::new("string_col", DataType::Utf8, false),
        ArrowField::new("category", DataType::Utf8, false),
        ArrowField::new("bool_col", DataType::Boolean, false),
        ArrowField::new("nullable_int", DataType::Int32, true),
        ArrowField::new(
            "labels_col",
            DataType::List(Arc::new(ArrowField::new("item", DataType::Utf8, true))),
            false,
        ),
        ArrowField::new(
            "vector_col",
            DataType::FixedSizeList(
                Arc::new(ArrowField::new("item", DataType::Float32, true)),
                VECTOR_DIM,
            ),
            false,
        ),
    ]))
}

/// Convert rows to a RecordBatch.
fn rows_to_batch(rows: &[TestRow]) -> RecordBatch {
    let schema = test_schema();
    let id_array: Int64Array = rows.iter().map(|r| r.id).collect();
    let int_array: Int32Array = rows.iter().map(|r| r.int_col).collect();
    let float_array: Float64Array = rows.iter().map(|r| r.float_col).collect();
    let string_array = StringArray::from_iter_values(rows.iter().map(|r| r.string_col.as_str()));
    let category_array = StringArray::from_iter_values(rows.iter().map(|r| r.category.as_str()));
    let bool_array: BooleanArray = rows.iter().map(|r| Some(r.bool_col)).collect();
    let nullable_int_array: Int32Array = rows.iter().map(|r| r.nullable_int).collect();
    // Build labels column as List<Utf8> with nullable elements
    let mut labels_builder = ListBuilder::new(StringBuilder::new());
    for row in rows {
        for label in &row.labels_col {
            match label {
                Some(s) => labels_builder.values().append_value(s),
                None => labels_builder.values().append_null(),
            }
        }
        labels_builder.append(true);
    }
    let labels_array = labels_builder.finish();
    // Build vector column as FixedSizeList<Float32, VECTOR_DIM>
    let vector_values: Vec<f32> = rows
        .iter()
        .flat_map(|r| r.vector_col.iter().copied())
        .collect();
    let vector_values_array = Float32Array::from(vector_values);
    let vector_field = Arc::new(ArrowField::new("item", DataType::Float32, true));
    let vector_array = FixedSizeListArray::try_new(
        vector_field,
        VECTOR_DIM,
        Arc::new(vector_values_array),
        None,
    )
    .unwrap();
    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(id_array),
            Arc::new(int_array),
            Arc::new(float_array),
            Arc::new(string_array),
            Arc::new(category_array),
            Arc::new(bool_array),
            Arc::new(nullable_int_array),
            Arc::new(labels_array),
            Arc::new(vector_array),
        ],
    )
    .unwrap()
}

#[tokio::test(flavor = "multi_thread")]
async fn test_reproduction() {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let schema = test_schema();
    const SEQ_ID: usize = 737129;

    // Create empty dataset first to match test harness behavior
    let empty_batch = RecordBatch::new_empty(schema.clone());
    let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone());
    let mut dataset = Dataset::write(reader, uri, None).await.unwrap();

    // Step 0: Write(100 rows)
    let rows: Vec<TestRow> = (0..100)
        .map(|i| {
            let seq_idx = SEQ_ID * 100 + 0;
            let id = (seq_idx * 10000 + 5000 + i) as i64;
            let int_val = (seq_idx * 100 + i) as i32;
            TestRow {
                id,
                int_col: int_val,
                float_col: int_val as f64,
                string_col: format!("small_{}", id),
                category: ["A", "B", "C", "D", "E"][i % 5].to_string(),
                bool_col: i % 2 == 0,
                nullable_int: if i % 3 == 0 { None } else { Some(int_val) },
                labels_col: TestRow::generate_labels(id),
                vector_col: TestRow::generate_vector(id),
            }
        })
        .collect();
    let batch = rows_to_batch(&rows);
    let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
    dataset.append(reader, None).await.unwrap();

    // Step 1: CreateIndex(nullable_int, BTree)
    dataset
        .create_index(&["nullable_int"], IndexType::BTree, None, &ScalarIndexParams::default(), true)
        .await
        .unwrap();

    // Verify filtered scan: (nullable_int != 0) OR (nullable_int < 5)
    // Expected: full scan + local filter (ground truth, no index)
    use arrow_array::Array;
    let all_batches: Vec<_> = dataset.scan().try_into_stream().await.unwrap()
        .try_collect::<Vec<_>>().await.unwrap();
    let expected_ids: HashSet<i64> = all_batches.iter()
        .flat_map(|b| {
            let ids = b.column_by_name("id").unwrap().as_any().downcast_ref::<Int64Array>().unwrap();
            let nullable_int = b.column_by_name("nullable_int").unwrap().as_any().downcast_ref::<Int32Array>().unwrap();
            (0..b.num_rows()).filter_map(move |i| {
                if (!nullable_int.is_null(i) && nullable_int.value(i) != 0)
                    || (!nullable_int.is_null(i) && nullable_int.value(i) < 5)
                {
                    Some(ids.value(i))
                } else {
                    None
                }
            })
        })
        .collect();

    // Actual: filtered scan (may use index)
    let filtered: Vec<_> = dataset.scan().filter("(nullable_int != 0) OR (nullable_int < 5)").unwrap()
        .try_into_stream().await.unwrap()
        .try_collect::<Vec<_>>().await.unwrap();
    let actual_ids: HashSet<i64> = filtered.iter()
        .flat_map(|b| {
            let ids = b.column_by_name("id").unwrap().as_any().downcast_ref::<Int64Array>().unwrap();
            (0..b.num_rows()).map(move |i| ids.value(i))
        })
        .collect();

    let only_expected: Vec<_> = expected_ids.difference(&actual_ids).copied().collect();
    let only_actual: Vec<_> = actual_ids.difference(&expected_ids).copied().collect();
    if !only_expected.is_empty() || !only_actual.is_empty() {
        eprintln!("Filtered scan mismatch for: (nullable_int != 0) OR (nullable_int < 5)");
        eprintln!("  expected: {} ids, actual: {} ids", expected_ids.len(), actual_ids.len());
        if !only_expected.is_empty() { eprintln!("  only in expected ({}): {:?}", only_expected.len(), only_expected); }
        if !only_actual.is_empty() { eprintln!("  only in actual ({}): {:?}", only_actual.len(), only_actual); }
        panic!("Filtered scan mismatch");
    }
}