Created June 17, 2016 13:14
Parallelize apply after pandas groupby using PySpark
import pandas as pd

# Spark context
import pyspark
sc = pyspark.SparkContext()

# apply parallel
def applyParallel(dfGrouped, func):
    # RDD with the group of dataframes
    groups = [group for name, group in dfGrouped]
    names = [name for name, group in dfGrouped]
    dummy_rdd = sc.parallelize(groups)
    # assuming that func(pandas dataframe) returns a series, the following is a list of pandas series
    ret_list = dummy_rdd.map(func).collect()
    # concatenate them in a pandas dataframe and return
    result = pd.concat([S.to_frame().transpose() for S in ret_list])
    result.index = names
    return result

# Example:
##########
def f(g):
    return pd.Series({'nrows': g.shape[0], 'ncols': g.shape[1]})

pepe = pd.DataFrame({'a': ['q1', 'q1', 'q2', 'q3', 'q4', 'q4', 'q4', 'q3'],
                     'b': [3, 5, 3, 6, 2, 4, 3, 5]})
juan = applyParallel(pepe.groupby('a'), f)
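
For reference, the parallel result should match the plain pandas equivalent, which is handy for checking the output. With the example data, each group keeps both columns ('a' and 'b'), so every row reports ncols = 2; the column order can differ across pandas versions since the Series is built from a dict:

# Serial equivalent, useful for verifying the parallel result:
serial = pepe.groupby('a').apply(f)

# Both juan and serial contain:
#     nrows  ncols
# q1      2      2
# q2      1      2
# q3      2      2
# q4      3      2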
Tried running this with Spark 2.2.0 and got this error:
File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\rdd.py", line 809, in collect
port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\rdd.py", line 2455, in _jrdd
self._jrdd_deserializer, profiler)
File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\rdd.py", line 2388, in _wrap_function
pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\rdd.py", line 2374, in _prepare_for_python_RDD
pickled_command = ser.dumps(command)
File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\serializers.py", line 460, in dumps
return cloudpickle.dumps(obj, 2)
File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\cloudpickle.py", line 704, in dumps
cp.dump(obj)
File "C:\ProgramData\Anaconda3\lib\site-packages\pyspark\cloudpickle.py", line 162, in dump
raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
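
The PicklingError above is Spark refusing to serialize a closure that references the SparkContext, directly or through something it closes over; only driver-side code may touch sc (SPARK-5063). The first thing to check is that the function passed to applyParallel is pure pandas-in, pandas-out, with no Spark objects in its scope. On Spark 2.3+ (so not 2.2.0 itself), the same per-group computation can also be expressed with a grouped-map pandas UDF, which avoids collecting the groups on the driver entirely. The sketch below is an assumption-laden illustration, not the gist author's code; it reuses the pepe example and assumes a SparkSession is available:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(pepe)  # pepe as defined in the gist above

# Grouped-map pandas UDF (Spark 2.3+): receives each group as a pandas
# DataFrame and must return a pandas DataFrame matching the declared schema.
# Note it references no Spark objects, so nothing trips SPARK-5063.
@pandas_udf('a string, nrows long, ncols long', PandasUDFType.GROUPED_MAP)
def f_spark(g):
    return pd.DataFrame({'a': [g['a'].iloc[0]],
                         'nrows': [g.shape[0]],
                         'ncols': [g.shape[1]]})

result = sdf.groupby('a').apply(f_spark).toPandas()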