andreacfm · November 18, 2025 15:33
diff --git a/drop_fully_null_columns.py b/drop_fully_null_columns.py
 def drop_fully_null_columns(data_frame):
    """
    Drops DataFrame columns that are fully null (i.e. hold no non-null value).
    Handles duplicate column names by using column positions.

    Arguments:
        data_frame {spark DataFrame} -- spark dataframe

    Returns:
        spark DataFrame -- dataframe with fully null columns removed; the
        input dataframe itself when there is nothing to drop
    """
    columns = data_frame.columns
    # agg() rejects an empty expression list, so a column-less frame is returned as is
    if not columns:
        return data_frame
    # Rename every column to a unique positional name: referencing columns by
    # their original name is ambiguous when names are duplicated
    positional = [f"col_{i}" for i in range(len(columns))]
    renamed = data_frame.toDF(*positional)
    # count() only counts non-null values, so 0 means the column is fully null
    # (same outcome as "max is null", but also valid for non-orderable types)
    agg_exprs = [F.count(F.col(name)).alias(name) for name in positional]
    non_null_counts = renamed.agg(*agg_exprs).collect()[0]
    keep_idx = [i for i, count in enumerate(non_null_counts) if count > 0]
    if len(keep_idx) == len(columns):
        return data_frame
    # Select survivors by position, then restore their original names
    # (drop(name) would remove every column sharing that name)
    kept = renamed.select(*[positional[i] for i in keep_idx])
    return kept.toDF(*[columns[i] for i in keep_idx])
	def drop_fully_null_columns(data_frame):
		"""
		Drops DataFrame columns that are fully null (i.e. hold no non-null value).
		Handles duplicate column names by using column positions.

		Arguments:
			data_frame {spark DataFrame} -- spark dataframe

		Returns:
			spark DataFrame -- dataframe with fully null columns removed; the
			input dataframe itself when there is nothing to drop
		"""
		columns = data_frame.columns
		# agg() rejects an empty expression list, so a column-less frame is returned as is
		if not columns:
			return data_frame
		# Rename every column to a unique positional name: referencing columns by
		# their original name is ambiguous when names are duplicated
		positional = [f"col_{i}" for i in range(len(columns))]
		renamed = data_frame.toDF(*positional)
		# count() only counts non-null values, so 0 means the column is fully null
		# (same outcome as "max is null", but also valid for non-orderable types)
		agg_exprs = [F.count(F.col(name)).alias(name) for name in positional]
		non_null_counts = renamed.agg(*agg_exprs).collect()[0]
		keep_idx = [i for i, count in enumerate(non_null_counts) if count > 0]
		if len(keep_idx) == len(columns):
			return data_frame
		# Select survivors by position, then restore their original names
		# (drop(name) would remove every column sharing that name)
		kept = renamed.select(*[positional[i] for i in keep_idx])
		return kept.toDF(*[columns[i] for i in keep_idx])
No results found