Last active
December 12, 2025 14:51
-
-
Save joninvski/a1c6fe21fed262739ac95fac5ba72fb6 to your computer and use it in GitHub Desktop.
Transform data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import csv | |
| import sys | |
def transform_csv(input_file, output_file):
    """
    Transform a CSV from wide format to long format.

    The input is expected to start with three header rows describing a
    three-level column hierarchy — top category (e.g. PGC, PI, PIP),
    middle category (e.g. Primária, Secundária, Multipack) and bottom
    category (material names) — followed by data rows whose first three
    cells identify the record (Sector, SubSector, CAE Principal).  Each
    value column is expanded into its own output row carrying the three
    category labels plus the cell value.

    Parameters
    ----------
    input_file : str
        Path of the wide-format CSV to read (UTF-8).
    output_file : str
        Path of the long-format CSV to write (UTF-8).

    Raises
    ------
    ValueError
        If the input contains fewer than three rows, i.e. the
        three-level header is incomplete.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        all_rows = list(csv.reader(f))

    # Robustness fix: fail with a clear message instead of a bare
    # IndexError when the mandatory three header rows are missing.
    if len(all_rows) < 3:
        raise ValueError(
            f"{input_file}: expected at least 3 header rows, got {len(all_rows)}"
        )

    # Parse the three header rows using proper CSV parsing
    header_row_0 = all_rows[0]  # Top category (PGC, PI, PIP)
    header_row_1 = all_rows[1]  # Middle category (Primária, Secundária, Multipack)
    header_row_2 = all_rows[2]  # Bottom category (Material names)

    # Debug: print lengths
    print(f"Header row 0 length: {len(header_row_0)}")
    print(f"Header row 1 length: {len(header_row_1)}")
    print(f"Header row 2 length: {len(header_row_2)}")

    # Pad every header row to the widest one so indexing never fails,
    # then strip stray whitespace from each label (one pass per row
    # instead of three copies of the pad/strip code).
    max_cols = max(len(header_row_0), len(header_row_1), len(header_row_2))
    print(f"Max columns: {max_cols}")
    header_row_0, header_row_1, header_row_2 = (
        [cell.strip() for cell in row + [''] * (max_cols - len(row))]
        for row in (header_row_0, header_row_1, header_row_2)
    )

    # Create mapping for each value column (columns after the first 3)
    column_mapping = [
        {
            'column_index': i,
            'top_category': header_row_0[i],
            'middle_category': header_row_1[i],
            'bottom_category': header_row_2[i],
        }
        for i in range(3, max_cols)
    ]
    print(f"Column mapping length: {len(column_mapping)}")

    # Parse data rows (everything after the three header rows)
    result_rows = []
    for cells in all_rows[3:]:
        # Skip rows that are empty or contain only whitespace
        if not cells or all(cell.strip() == '' for cell in cells):
            continue
        # Rows too short to carry the three identifier columns are skipped
        if len(cells) < 3:
            continue
        sector = cells[0].strip()
        subsector = cells[1].strip()
        cae = cells[2].strip()
        # For each value column, emit one long-format row
        for col_info in column_mapping:
            col_idx = col_info['column_index']
            # A data row may be shorter than the header; treat the
            # missing trailing cells as empty values.
            value = cells[col_idx].strip() if col_idx < len(cells) else ''
            result_rows.append({
                'Sector': sector,
                'SubSector': subsector,
                'CAE Principal': cae,
                'Top_Category': col_info['top_category'],
                'Middle_Category': col_info['middle_category'],
                'Bottom_Category': col_info['bottom_category'],
                'Value': value,
            })

    # Write output CSV
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        fieldnames = ['Sector', 'SubSector', 'CAE Principal', 'Top_Category',
                      'Middle_Category', 'Bottom_Category', 'Value']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(result_rows)

    print(f"Transformation complete! Output saved to: {output_file}")
    print(f"Total output rows: {len(result_rows)}")
if __name__ == "__main__":
    # Command-line entry point: expect exactly two positional arguments,
    # the input path and the output path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python transform_csv.py <input_file> <output_file>")
        print("Example: python transform_csv.py input.csv output.csv")
        sys.exit(1)
    transform_csv(cli_args[0], cli_args[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment