Skip to content

Instantly share code, notes, and snippets.

@joninvski
Last active December 12, 2025 14:51
Show Gist options
  • Select an option

  • Save joninvski/a1c6fe21fed262739ac95fac5ba72fb6 to your computer and use it in GitHub Desktop.

Select an option

Save joninvski/a1c6fe21fed262739ac95fac5ba72fb6 to your computer and use it in GitHub Desktop.
Transform data
import csv
import sys
def transform_csv(input_file, output_file):
"""
Transform CSV from wide format to long format.
Converts column-based categories (Top, Middle, Bottom) into row-based format.
"""
with open(input_file, 'r', encoding='utf-8') as f:
csv_reader = csv.reader(f)
all_rows = list(csv_reader)
# Parse the three header rows using proper CSV parsing
header_row_0 = all_rows[0] # Top category (PGC, PI, PIP)
header_row_1 = all_rows[1] # Middle category (Primária, Secundária, Multipack)
header_row_2 = all_rows[2] # Bottom category (Material names)
# Debug: print lengths
print(f"Header row 0 length: {len(header_row_0)}")
print(f"Header row 1 length: {len(header_row_1)}")
print(f"Header row 2 length: {len(header_row_2)}")
# Find the maximum column count
max_cols = max(len(header_row_0), len(header_row_1), len(header_row_2))
print(f"Max columns: {max_cols}")
# Pad shorter rows with empty strings
header_row_0.extend([''] * (max_cols - len(header_row_0)))
header_row_1.extend([''] * (max_cols - len(header_row_1)))
header_row_2.extend([''] * (max_cols - len(header_row_2)))
# Strip whitespace
header_row_0 = [cell.strip() for cell in header_row_0]
header_row_1 = [cell.strip() for cell in header_row_1]
header_row_2 = [cell.strip() for cell in header_row_2]
# Create mapping for each value column (columns after the first 3)
column_mapping = []
for i in range(3, max_cols):
column_mapping.append({
'column_index': i,
'top_category': header_row_0[i],
'middle_category': header_row_1[i],
'bottom_category': header_row_2[i]
})
print(f"Column mapping length: {len(column_mapping)}")
# Parse data rows (starting from row 3, which is index 3)
result_rows = []
for row_idx in range(3, len(all_rows)):
cells = all_rows[row_idx]
# Skip empty rows
if not cells or all(cell.strip() == '' for cell in cells):
continue
# Get identifier values (first 3 columns)
if len(cells) < 3:
continue
sector = cells[0].strip()
subsector = cells[1].strip()
cae = cells[2].strip()
# For each value column, create a new row
for col_info in column_mapping:
col_idx = col_info['column_index']
# Get the value at this column index
if col_idx < len(cells):
value = cells[col_idx].strip()
else:
value = ''
result_rows.append({
'Sector': sector,
'SubSector': subsector,
'CAE Principal': cae,
'Top_Category': col_info['top_category'],
'Middle_Category': col_info['middle_category'],
'Bottom_Category': col_info['bottom_category'],
'Value': value
})
# Write output CSV
with open(output_file, 'w', encoding='utf-8', newline='') as f:
fieldnames = ['Sector', 'SubSector', 'CAE Principal', 'Top_Category', 'Middle_Category', 'Bottom_Category', 'Value']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(result_rows)
print(f"Transformation complete! Output saved to: {output_file}")
print(f"Total output rows: {len(result_rows)}")
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: python transform_csv.py <input_file> <output_file>")
print("Example: python transform_csv.py input.csv output.csv")
sys.exit(1)
input_file = sys.argv[1]
output_file = sys.argv[2]
transform_csv(input_file, output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment