git clone <this-gist-url> lance-repro
cd lance-repro
mkdir -p tests && mv repro.rs tests/
cargo test --release --test repro -- --nocapture
Created
February 11, 2026 04:12
-
-
Save wkalt/3381046407097f6180818a13aec95670 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Minimal crate manifest for the standalone Lance bug reproduction test.
[package]
name = "lance-repro"
version = "0.1.0"
edition = "2021"

[dependencies]
arrow-array = "57"
arrow-schema = "57"
futures = "0.3"
# All lance crates are pinned to the same git revision so their internal
# APIs agree with each other and the reproduction stays deterministic.
lance = { git = "https://github.com/lancedb/lance.git", rev = "62e2ebbb" }
lance-datafusion = { git = "https://github.com/lancedb/lance.git", rev = "62e2ebbb" }
lance-index = { git = "https://github.com/lancedb/lance.git", rev = "62e2ebbb" }
lance-linalg = { git = "https://github.com/lancedb/lance.git", rev = "62e2ebbb" }
tempfile = "3"
tokio = { version = "1", features = ["full"] }

# Empty workspace table: keeps this crate standalone even if it is nested
# inside a directory that belongs to another cargo workspace.
[workspace]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| //! Standalone reproduction test generated by vibecheck sequence testing. | |
| //! | |
| //! Sequence ID: 4842 | |
| //! | |
| //! Sequence: | |
| //! 0. Write(100 rows) | |
| //! 1. Update(category='X' WHERE int_col>1000) | |
| //! 2. CreateIndex(category, Bitmap) | |
| //! | |
| //! Error: Op 2: Invariant violation after CreateIndex(Category, Bitmap): Invariant violated: Index queries match full scan | |
| use arrow_array::{Int64Array, Int32Array, Float64Array, Float32Array, StringArray, BooleanArray, RecordBatch, RecordBatchIterator, FixedSizeListArray}; | |
| use arrow_array::builder::{ListBuilder, StringBuilder}; | |
| use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema}; | |
| use futures::TryStreamExt; | |
| use lance::dataset::{WriteParams, WriteMode}; | |
| use lance::Dataset; | |
| use lance_index::scalar::ScalarIndexParams; | |
| use lance_index::{DatasetIndexExt, IndexType}; | |
| use std::collections::HashSet; | |
| use std::sync::Arc; | |
| use tempfile::tempdir; | |
| use lance::dataset::UpdateBuilder; | |
/// Vector dimension for testing (32D keeps tests fast)
const VECTOR_DIM: i32 = 32;

/// A single row of test data (with ID assigned).
///
/// Mirrors `test_schema()` column-for-column; `rows_to_batch` converts a
/// slice of these into an Arrow `RecordBatch`.
#[derive(Debug, Clone, PartialEq)]
struct TestRow {
    /// Unique row identifier (column `id`, non-null Int64).
    id: i64,
    /// Deterministic integer payload (column `int_col`, non-null Int32).
    int_col: i32,
    /// Float payload, derived from `int_col` (column `float_col`).
    float_col: f64,
    /// String payload, e.g. "small_<id>" (column `string_col`).
    string_col: String,
    /// Category label "A".."E" at write time; updates may rewrite it
    /// (column `category`) — this is the column the bitmap index targets.
    category: String,
    /// Alternating boolean payload (column `bool_col`).
    bool_col: bool,
    /// The only top-level-nullable column (column `nullable_int`).
    nullable_int: Option<i32>,
    /// List of nullable strings (column `labels_col`, List<Utf8>).
    labels_col: Vec<Option<String>>,
    /// Fixed-size embedding of length VECTOR_DIM (column `vector_col`).
    vector_col: Vec<f32>,
}
| impl TestRow { | |
| /// Generate deterministic labels based on the row ID. | |
| fn generate_labels(id: i64) -> Vec<Option<String>> { | |
| let labels = ["alpha", "beta", "gamma", "delta", "epsilon"]; | |
| match id % 8 { | |
| 0 => vec![Some(labels[(id % 5) as usize].to_string())], | |
| 1 => vec![ | |
| Some(labels[(id % 5) as usize].to_string()), | |
| Some(labels[((id + 1) % 5) as usize].to_string()), | |
| ], | |
| 2 => vec![Some(labels[(id % 5) as usize].to_string()), None], | |
| 3 => vec![None, Some(labels[(id % 5) as usize].to_string())], | |
| 4 => vec![None], | |
| 5 => vec![], | |
| 6 => vec![ | |
| Some(labels[(id % 5) as usize].to_string()), | |
| None, | |
| Some(labels[((id + 2) % 5) as usize].to_string()), | |
| ], | |
| _ => vec![ | |
| Some(labels[(id % 5) as usize].to_string()), | |
| Some(labels[((id + 1) % 5) as usize].to_string()), | |
| Some(labels[((id + 2) % 5) as usize].to_string()), | |
| ], | |
| } | |
| } | |
| /// Generate a deterministic vector based on the row ID. | |
| fn generate_vector(id: i64) -> Vec<f32> { | |
| (0..VECTOR_DIM as usize) | |
| .map(|i| { | |
| let angle = (id as f32 * 0.1) + (i as f32 * 0.3); | |
| angle.sin() | |
| }) | |
| .collect() | |
| } | |
| } | |
| /// Create the test schema. | |
| fn test_schema() -> Arc<ArrowSchema> { | |
| Arc::new(ArrowSchema::new(vec![ | |
| ArrowField::new("id", DataType::Int64, false), | |
| ArrowField::new("int_col", DataType::Int32, false), | |
| ArrowField::new("float_col", DataType::Float64, false), | |
| ArrowField::new("string_col", DataType::Utf8, false), | |
| ArrowField::new("category", DataType::Utf8, false), | |
| ArrowField::new("bool_col", DataType::Boolean, false), | |
| ArrowField::new("nullable_int", DataType::Int32, true), | |
| ArrowField::new( | |
| "labels_col", | |
| DataType::List(Arc::new(ArrowField::new("item", DataType::Utf8, true))), | |
| false, | |
| ), | |
| ArrowField::new( | |
| "vector_col", | |
| DataType::FixedSizeList( | |
| Arc::new(ArrowField::new("item", DataType::Float32, true)), | |
| VECTOR_DIM, | |
| ), | |
| false, | |
| ), | |
| ])) | |
| } | |
/// Convert rows to a RecordBatch.
///
/// Column order must match `test_schema()` exactly, since
/// `RecordBatch::try_new` pairs arrays with schema fields positionally.
/// Panics (via `unwrap`) if the built arrays disagree with the schema —
/// which would be a bug in this generator, not recoverable input error.
fn rows_to_batch(rows: &[TestRow]) -> RecordBatch {
    let schema = test_schema();
    let id_array: Int64Array = rows.iter().map(|r| r.id).collect();
    let int_array: Int32Array = rows.iter().map(|r| r.int_col).collect();
    let float_array: Float64Array = rows.iter().map(|r| r.float_col).collect();
    let string_array = StringArray::from_iter_values(rows.iter().map(|r| r.string_col.as_str()));
    let category_array = StringArray::from_iter_values(rows.iter().map(|r| r.category.as_str()));
    // Wrapping in Some() keeps the column non-null while using the
    // FromIterator<Option<bool>> impl.
    let bool_array: BooleanArray = rows.iter().map(|r| Some(r.bool_col)).collect();
    let nullable_int_array: Int32Array = rows.iter().map(|r| r.nullable_int).collect();
    // Build labels column as List<Utf8> with nullable elements
    let mut labels_builder = ListBuilder::new(StringBuilder::new());
    for row in rows {
        for label in &row.labels_col {
            match label {
                Some(s) => labels_builder.values().append_value(s),
                None => labels_builder.values().append_null(),
            }
        }
        // append(true) closes the current list entry as non-null
        // (possibly empty, matching the id % 8 == 5 case).
        labels_builder.append(true);
    }
    let labels_array = labels_builder.finish();
    // Build vector column as FixedSizeList<Float32, VECTOR_DIM>
    // by flattening all row vectors into one contiguous values buffer.
    let vector_values: Vec<f32> = rows
        .iter()
        .flat_map(|r| r.vector_col.iter().copied())
        .collect();
    let vector_values_array = Float32Array::from(vector_values);
    let vector_field = Arc::new(ArrowField::new("item", DataType::Float32, true));
    let vector_array = FixedSizeListArray::try_new(
        vector_field,
        VECTOR_DIM,
        Arc::new(vector_values_array),
        None, // no top-level null mask: every row has a vector
    )
    .unwrap();
    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(id_array),
            Arc::new(int_array),
            Arc::new(float_array),
            Arc::new(string_array),
            Arc::new(category_array),
            Arc::new(bool_array),
            Arc::new(nullable_int_array),
            Arc::new(labels_array),
            Arc::new(vector_array),
        ],
    )
    .unwrap()
}
| #[tokio::test(flavor = "multi_thread")] | |
| async fn test_reproduction() { | |
| let dir = tempdir().unwrap(); | |
| let uri = dir.path().to_str().unwrap(); | |
| let schema = test_schema(); | |
| const SEQ_ID: usize = 4842; | |
| // Create empty dataset first to match test harness behavior | |
| let empty_batch = RecordBatch::new_empty(schema.clone()); | |
| let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone()); | |
| let mut dataset = Dataset::write(reader, uri, None).await.unwrap(); | |
| // Step 0: Write(100 rows) | |
| let rows: Vec<TestRow> = (0..100) | |
| .map(|i| { | |
| let seq_idx = SEQ_ID * 100 + 0; | |
| let id = (seq_idx * 10000 + 5000 + i) as i64; | |
| let int_val = (seq_idx * 100 + i) as i32; | |
| TestRow { | |
| id, | |
| int_col: int_val, | |
| float_col: int_val as f64, | |
| string_col: format!("small_{}", id), | |
| category: ["A", "B", "C", "D", "E"][i % 5].to_string(), | |
| bool_col: i % 2 == 0, | |
| nullable_int: if i % 3 == 0 { None } else { Some(int_val) }, | |
| labels_col: TestRow::generate_labels(id), | |
| vector_col: TestRow::generate_vector(id), | |
| } | |
| }) | |
| .collect(); | |
| let batch = rows_to_batch(&rows); | |
| let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); | |
| dataset.append(reader, None).await.unwrap(); | |
| // Step 1: Update(category='X' WHERE int_col>1000) | |
| let result = UpdateBuilder::new(Arc::new(dataset.clone())) | |
| .update_where("int_col > 1000").unwrap() | |
| .set("category", "'X'").unwrap() | |
| .build().unwrap() | |
| .execute().await.unwrap(); | |
| dataset = result.new_dataset.as_ref().clone(); | |
| // Step 2: CreateIndex(category, Bitmap) | |
| dataset.create_index(&["category"], IndexType::Bitmap, None, &ScalarIndexParams::default(), true).await.unwrap(); | |
| // Verify index consistency | |
| // Get expected IDs by filtering full scan locally (ground truth) | |
| let all_batches: Vec<_> = dataset.scan().try_into_stream().await.unwrap() | |
| .try_collect::<Vec<_>>().await.unwrap(); | |
| let expected_ids: HashSet<i64> = all_batches.iter() | |
| .flat_map(|b| { | |
| let ids = b.column_by_name("id").unwrap().as_any().downcast_ref::<Int64Array>().unwrap(); | |
| let vals = b.column_by_name("category").unwrap().as_any().downcast_ref::<StringArray>().unwrap(); | |
| (0..b.num_rows()).filter_map(move |i| { | |
| if true { Some(ids.value(i)) } else { None } | |
| }) | |
| }) | |
| .collect(); | |
| // Get actual IDs from filtered query (may use index) | |
| let filtered: Vec<_> = dataset.scan().filter("category IN ('A', 'B', 'C', 'D', 'E')").unwrap() | |
| .try_into_stream().await.unwrap() | |
| .try_collect::<Vec<_>>().await.unwrap(); | |
| let actual_ids: HashSet<i64> = filtered.iter() | |
| .flat_map(|b| { | |
| let ids = b.column_by_name("id").unwrap().as_any().downcast_ref::<Int64Array>().unwrap(); | |
| (0..b.num_rows()).map(move |i| ids.value(i)) | |
| }) | |
| .collect(); | |
| // Compare in both directions | |
| let only_expected: Vec<_> = expected_ids.difference(&actual_ids).copied().collect(); | |
| let only_actual: Vec<_> = actual_ids.difference(&expected_ids).copied().collect(); | |
| if !only_expected.is_empty() || !only_actual.is_empty() { | |
| eprintln!("Index query mismatch for: category IN ('A', 'B', 'C', 'D', 'E')"); | |
| eprintln!(" expected: {} ids, actual: {} ids", expected_ids.len(), actual_ids.len()); | |
| if !only_expected.is_empty() { eprintln!(" only in expected ({}): {:?}", only_expected.len(), only_expected); } | |
| if !only_actual.is_empty() { eprintln!(" only in actual ({}): {:?}", only_actual.len(), only_actual); } | |
| panic!("Index query mismatch"); | |
| } | |
| eprintln!("Index query returned {} rows (dataset has {} total rows)", actual_ids.len(), all_batches.iter().map(|b| b.num_rows()).sum::<usize>()); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment