git clone <this-gist-url> lance-repro
cd lance-repro
mkdir -p tests && mv repro.rs tests/
cargo test --release --test repro -- --nocapture
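If the bug reproduces, the test panics during the final filtered scan with the internal error recorded in repro.rs: "RowAddrTreeMap::from_sorted_iter called with non-sorted input", raised from lance-index's scalar BTree code (rust/lance-index/src/scalar/btree/flat.rs).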
Cargo.toml:
[package]
name = "lance-repro"
version = "0.1.0"
edition = "2021"

[dependencies]
arrow-array = "57"
arrow-schema = "57"
futures = "0.3"
lance = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-datafusion = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-index = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
lance-linalg = { git = "https://github.com/lancedb/lance.git", rev = "e0301f58" }
tempfile = "3"
tokio = { version = "1", features = ["full"] }

[workspace]
repro.rs:
//! Standalone reproduction test generated by vibecheck sequence testing.
//!
//! Mode: stable_row_ids=true
//!
//! Sequence ID: 0
//!
//! Sequence:
//! 0. Write(100 rows)
//! 1. CreateIndex(int_col, BTree)
//! 2. Update(int_col=-1 WHERE category='A')
//! 3. OptimizeIndices
//!
//! Error: Op 3: Invariant violation after OptimizeIndices: Invariant violated: Filtered scans match model

use arrow_array::{Int64Array, Int32Array, Float64Array, Float32Array, StringArray, BooleanArray, RecordBatch, RecordBatchIterator, FixedSizeListArray};
use arrow_array::builder::{ListBuilder, StringBuilder};
use arrow_schema::{DataType, Field as ArrowField, Schema as ArrowSchema};
use futures::TryStreamExt;
use lance::dataset::{WriteParams, WriteMode};
use lance::Dataset;
use lance_index::scalar::ScalarIndexParams;
use lance_index::{DatasetIndexExt, IndexType};
use std::collections::HashSet;
use std::sync::Arc;
use tempfile::tempdir;
use lance::dataset::UpdateBuilder;

/// Vector dimension for testing (32D keeps tests fast)
const VECTOR_DIM: i32 = 32;

/// A single row of test data (with ID assigned).
#[derive(Debug, Clone, PartialEq)]
struct TestRow {
    id: i64,
    int_col: i32,
    float_col: f64,
    string_col: String,
    category: String,
    bool_col: bool,
    nullable_int: Option<i32>,
    labels_col: Vec<Option<String>>,
    vector_col: Vec<f32>,
}

impl TestRow {
    /// Generate deterministic labels based on the row ID.
    fn generate_labels(id: i64) -> Vec<Option<String>> {
        let labels = ["alpha", "beta", "gamma", "delta", "epsilon"];
        match id % 8 {
            0 => vec![Some(labels[(id % 5) as usize].to_string())],
            1 => vec![
                Some(labels[(id % 5) as usize].to_string()),
                Some(labels[((id + 1) % 5) as usize].to_string()),
            ],
            2 => vec![Some(labels[(id % 5) as usize].to_string()), None],
            3 => vec![None, Some(labels[(id % 5) as usize].to_string())],
            4 => vec![None],
            5 => vec![],
            6 => vec![
                Some(labels[(id % 5) as usize].to_string()),
                None,
                Some(labels[((id + 2) % 5) as usize].to_string()),
            ],
            _ => vec![
                Some(labels[(id % 5) as usize].to_string()),
                Some(labels[((id + 1) % 5) as usize].to_string()),
                Some(labels[((id + 2) % 5) as usize].to_string()),
            ],
        }
    }

    /// Generate a deterministic vector based on the row ID.
    fn generate_vector(id: i64) -> Vec<f32> {
        (0..VECTOR_DIM as usize)
            .map(|i| {
                let angle = (id as f32 * 0.1) + (i as f32 * 0.3);
                angle.sin()
            })
            .collect()
    }
}

/// Create the test schema.
fn test_schema() -> Arc<ArrowSchema> {
    Arc::new(ArrowSchema::new(vec![
        ArrowField::new("id", DataType::Int64, false),
        ArrowField::new("int_col", DataType::Int32, false),
        ArrowField::new("float_col", DataType::Float64, false),
        ArrowField::new("string_col", DataType::Utf8, false),
        ArrowField::new("category", DataType::Utf8, false),
        ArrowField::new("bool_col", DataType::Boolean, false),
        ArrowField::new("nullable_int", DataType::Int32, true),
        ArrowField::new(
            "labels_col",
            DataType::List(Arc::new(ArrowField::new("item", DataType::Utf8, true))),
            false,
        ),
        ArrowField::new(
            "vector_col",
            DataType::FixedSizeList(
                Arc::new(ArrowField::new("item", DataType::Float32, true)),
                VECTOR_DIM,
            ),
            false,
        ),
    ]))
}

/// Convert rows to a RecordBatch.
fn rows_to_batch(rows: &[TestRow]) -> RecordBatch {
    let schema = test_schema();
    let id_array: Int64Array = rows.iter().map(|r| r.id).collect();
    let int_array: Int32Array = rows.iter().map(|r| r.int_col).collect();
    let float_array: Float64Array = rows.iter().map(|r| r.float_col).collect();
    let string_array = StringArray::from_iter_values(rows.iter().map(|r| r.string_col.as_str()));
    let category_array = StringArray::from_iter_values(rows.iter().map(|r| r.category.as_str()));
    let bool_array: BooleanArray = rows.iter().map(|r| Some(r.bool_col)).collect();
    let nullable_int_array: Int32Array = rows.iter().map(|r| r.nullable_int).collect();
    // Build labels column as List<Utf8> with nullable elements
    let mut labels_builder = ListBuilder::new(StringBuilder::new());
    for row in rows {
        for label in &row.labels_col {
            match label {
                Some(s) => labels_builder.values().append_value(s),
                None => labels_builder.values().append_null(),
            }
        }
        labels_builder.append(true);
    }
    let labels_array = labels_builder.finish();
    // Build vector column as FixedSizeList<Float32, VECTOR_DIM>
    let vector_values: Vec<f32> = rows
        .iter()
        .flat_map(|r| r.vector_col.iter().copied())
        .collect();
    let vector_values_array = Float32Array::from(vector_values);
    let vector_field = Arc::new(ArrowField::new("item", DataType::Float32, true));
    let vector_array = FixedSizeListArray::try_new(
        vector_field,
        VECTOR_DIM,
        Arc::new(vector_values_array),
        None,
    )
    .unwrap();
    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(id_array),
            Arc::new(int_array),
            Arc::new(float_array),
            Arc::new(string_array),
            Arc::new(category_array),
            Arc::new(bool_array),
            Arc::new(nullable_int_array),
            Arc::new(labels_array),
            Arc::new(vector_array),
        ],
    )
    .unwrap()
}

#[tokio::test(flavor = "multi_thread")]
async fn test_reproduction() {
    let dir = tempdir().unwrap();
    let uri = dir.path().to_str().unwrap();
    let schema = test_schema();
    const SEQ_ID: usize = 0;

    // Create empty dataset first to match test harness behavior
    let empty_batch = RecordBatch::new_empty(schema.clone());
    let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone());
    let write_params = WriteParams { enable_stable_row_ids: true, ..Default::default() };
    let mut dataset = Dataset::write(reader, uri, Some(write_params)).await.unwrap();

    // Step 0: Write(100 rows)
    let rows: Vec<TestRow> = (0..100)
        .map(|i| {
            let seq_idx = SEQ_ID * 100 + 0;
            let id = (seq_idx * 10000 + 5000 + i) as i64;
            let int_val = (seq_idx * 100 + i) as i32;
            TestRow {
                id,
                int_col: int_val,
                float_col: int_val as f64,
                string_col: format!("small_{}", id),
                category: ["A", "B", "C", "D", "E"][i % 5].to_string(),
                bool_col: i % 2 == 0,
                nullable_int: if i % 3 == 0 { None } else { Some(int_val) },
                labels_col: TestRow::generate_labels(id),
                vector_col: TestRow::generate_vector(id),
            }
        })
        .collect();
    let batch = rows_to_batch(&rows);
    let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone());
    dataset.append(reader, None).await.unwrap();

    // Step 1: CreateIndex(int_col, BTree)
    dataset.create_index(&["int_col"], IndexType::BTree, None, &ScalarIndexParams::default(), true).await.unwrap();

    // Step 2: Update(int_col=-1 WHERE category='A')
    let result = UpdateBuilder::new(Arc::new(dataset.clone()))
        .update_where("category = 'A'").unwrap()
        .set("int_col", "-1").unwrap()
        .build().unwrap()
        .execute().await.unwrap();
    dataset = result.new_dataset.as_ref().clone();

    // Step 3: OptimizeIndices
    dataset.optimize_indices(&Default::default()).await.unwrap();

    // Dataset error during invariant check
    // Invariant: FilteredScanMatch
    // Context: filter=int_col < 200
    // Error: Encountered internal error. Please file a bug report at https://github.com/lance-format/lance/issues. RowAddrTreeMap::from_sorted_iter called with non-sorted input, /home/user/.cargo/git/checkouts/lance-8cdf2ec69b3663b6/e0301f5/rust/lance-index/src/scalar/btree/flat.rs:56:29

    // Trigger the bug by doing the exact filtered scan that failed
    dataset.scan().filter("int_col < 200").unwrap().try_into_stream().await.unwrap().try_collect::<Vec<_>>().await.unwrap();
}
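For context, the FilteredScanMatch invariant mentioned in the test's comments compares an index-assisted filtered scan against the same predicate evaluated without pushdown. Below is a minimal sketch of that kind of cross-check, reusing the imports from repro.rs; the helper name and structure are illustrative and not part of the generated repro.

/// Illustrative helper (not in the original gist): count rows matching
/// `int_col < 200` two ways and return (in_memory_count, indexed_count).
async fn count_int_col_lt_200(dataset: &Dataset) -> (usize, usize) {
    // Full scan with the predicate applied in memory, bypassing the BTree index.
    let all_batches: Vec<RecordBatch> = dataset
        .scan()
        .try_into_stream()
        .await
        .unwrap()
        .try_collect()
        .await
        .unwrap();
    let in_memory: usize = all_batches
        .iter()
        .map(|batch| {
            batch
                .column_by_name("int_col")
                .unwrap()
                .as_any()
                .downcast_ref::<Int32Array>()
                .unwrap()
                .iter()
                .filter(|v| matches!(v, Some(x) if *x < 200))
                .count()
        })
        .sum();

    // Filtered scan with the predicate pushed down, exercising the scalar
    // index state produced by OptimizeIndices.
    let filtered_batches: Vec<RecordBatch> = dataset
        .scan()
        .filter("int_col < 200")
        .unwrap()
        .try_into_stream()
        .await
        .unwrap()
        .try_collect()
        .await
        .unwrap();
    let indexed: usize = filtered_batches.iter().map(|b| b.num_rows()).sum();

    (in_memory, indexed)
}

In a healthy dataset the two counts agree; in this repro, the indexed path panics inside RowAddrTreeMap::from_sorted_iter before any count is produced.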