Sequential vs Parallel Computing to Compute Prefix Sum (Scan) Using OpenMP
/*
How to Compute a Prefix Sum (Scan) in Parallel Using OpenMP

Prefix Sum (Scan) Definition:
Given an array
    A = [a0, a1, a2, a3, ...]
the prefix sum array P is
    P[i] = a0 + a1 + a2 + ... + ai
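For example, A = [3, 1, 4, 1, 5] gives
    P = [3, 3+1, 3+1+4, 3+1+4+1, 3+1+4+1+5] = [3, 4, 8, 9, 14]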

Unlike a simple sum or count, the prefix sum has DATA DEPENDENCIES:
P[i] depends on P[i-1], so the scan loop cannot be parallelized naively.
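For instance, this naive version is WRONG, because iteration i reads
P[i-1], which another thread may not have written yet:

    #pragma omp parallel for
    for (long long i = 1; i < N; i++)
        P[i] = P[i - 1] + A[i]; // race on P[i-1]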

To parallelize the prefix sum, this program uses a TWO-PHASE strategy
(a worked example follows the phase descriptions below):
------------------------------------
PHASE 1: Local Prefix Sums (Parallel)
------------------------------------
1. OpenMP creates a team of threads (fork)
2. The array is divided into chunks:
- T0 → arr[0 ... k-1]
- T1 → arr[k ... 2k-1]
- ...
3. Each thread computes a LOCAL prefix sum for its chunk
4. Each thread stores the total sum of its chunk
------------------------------------
PHASE 2: Offset Adjustment (Parallel)
------------------------------------
5. Prefix sums of chunk totals are computed (small array)
6. Each thread adds the correct offset to its local prefix results
7. Threads join → final correct prefix sum array
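
Worked example with 2 threads on A = [3, 1, 4, 1, 5, 9]:
  Phase 1: T0 scans [3, 1, 4] -> [3, 4, 8]; T1 scans [1, 5, 9] -> [1, 6, 15].
           Chunk totals: [8, 15].
  Phase 2: Prefix sum of the totals gives offsets [0, 8]; T1 adds 8 to its
           local results -> [9, 14, 23].
  Final:   [3, 4, 8, 9, 14, 23]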
This program compares:
- Sequential prefix sum
- Parallel prefix sum using OpenMP (two-phase method)
Compile:
>>> g++ -O2 -fopenmp -o parallel_prefix_sum parallel_prefix_sum.cpp
Run:
>>> ./parallel_prefix_sum
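Optionally pin the thread count with the standard OpenMP environment variable:
>>> OMP_NUM_THREADS=8 ./parallel_prefix_sum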
*/
#include <algorithm> // min
#include <cstdlib>   // rand
#include <iostream>
#include <vector>
#include <omp.h>
using namespace std;
int main()
{
    const long long N = 100000000; // 100 million elements

    // Values are in [0, 9], so the largest prefix sum is at most 9e8,
    // which still fits in a 32-bit int.
    vector<int> arr(N), seq_prefix(N), par_prefix(N);

    // Initialize array with small random values
    for (long long i = 0; i < N; i++)
    {
        arr[i] = rand() % 10;
    }
    // ---------------------------
    // 1. Sequential Prefix Sum
    // ---------------------------
    double start_seq = omp_get_wtime();

    seq_prefix[0] = arr[0];
    for (long long i = 1; i < N; i++)
    {
        seq_prefix[i] = seq_prefix[i - 1] + arr[i];
    }

    double end_seq = omp_get_wtime();
    double seq_time = end_seq - start_seq;
    // ---------------------------
    // 2. Parallel Prefix Sum
    // ---------------------------
    double start_par = omp_get_wtime();

    int num_threads;
    vector<long long> chunk_sum;

    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        int total_threads = omp_get_num_threads();

        // One thread sizes the shared chunk_sum vector; the implicit
        // barrier at the end of "single" makes it safe to use below.
        #pragma omp single
        {
            num_threads = total_threads;
            chunk_sum.resize(num_threads);
        }

        long long chunk_size = (N + num_threads - 1) / num_threads;
        long long start = tid * chunk_size;
        long long end = min(start + chunk_size, N);

        // Phase 1: local prefix sum over this thread's chunk
        if (start < end)
        {
            par_prefix[start] = arr[start];
            for (long long i = start + 1; i < end; i++)
            {
                par_prefix[i] = par_prefix[i - 1] + arr[i];
            }
            chunk_sum[tid] = par_prefix[end - 1]; // total of this chunk
        }
        else
        {
            chunk_sum[tid] = 0; // more threads than elements
        }

        // Wait until every thread has published its chunk total
        #pragma omp barrier

        // Phase 2: this thread's offset is the sum of all earlier chunk
        // totals (O(num_threads) work per thread, negligible for small teams)
        long long offset = 0;
        for (int i = 0; i < tid; i++)
        {
            offset += chunk_sum[i];
        }

        // Shift the local results by the offset
        for (long long i = start; i < end; i++)
        {
            par_prefix[i] += offset;
        }
    }

    double end_par = omp_get_wtime();
    double par_time = end_par - start_par;
    // ---------------------------
    // Verification
    // ---------------------------
    bool correct = true;
    for (long long i = 0; i < N; i++)
    {
        if (seq_prefix[i] != par_prefix[i])
        {
            correct = false;
            break;
        }
    }
    // ---------------------------
    // Print Results
    // ---------------------------
    cout << "Prefix Sum for array of size N = " << N << "\n\n";
    cout << "Sequential Time: " << seq_time << " sec\n";
    cout << "Parallel Time: " << par_time << " sec\n";
    cout << "\nSpeedup = " << (seq_time / par_time) << "x\n";

    if (correct)
    {
        cout << "\nPrefix sums match. Parallel execution is correct.\n";
    }
    else
    {
        cout << "\nMismatch detected! Parallel prefix sum is incorrect.\n";
    }

    return 0;
}
/*
Example Output:

Prefix Sum for array of size N = 100000000

Sequential Time: 1.215 sec
Parallel Time: 1.074 sec

Speedup = 1.13129x

Prefix sums match. Parallel execution is correct.

Note: Timings depend on system architecture and thread count. Prefix sum is
memory-bandwidth-bound, and the parallel version makes an extra pass over the
array in Phase 2, so speedup is typically well below the thread count.
*/
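
/*
Alternative: OpenMP 5.0 adds a built-in scan reduction, which expresses the
same two-phase idea directly. A minimal sketch, assuming a compiler with
OpenMP 5.0 scan support (e.g., GCC 10 or newer with -fopenmp):

    long long sum = 0;
    #pragma omp parallel for reduction(inscan, + : sum)
    for (long long i = 0; i < N; i++)
    {
        sum += arr[i];                 // reduction step
        #pragma omp scan inclusive(sum)
        par_prefix[i] = sum;           // reads the inclusive scan value
    }
*/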