Created
January 4, 2026 06:38
-
-
Save iamtgiri/6214aa17ed9f5e52ee73eca5d05488e9 to your computer and use it in GitHub Desktop.
Sequential vs Parallel Computing to Compute Prefix Sum (Scan) Using OpenMP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| How Parallel Computing Works to Compute Prefix Sum (Scan) Using OpenMP | |
| Prefix Sum (Scan) Definition: | |
| Given an array: | |
| A = [a0, a1, a2, a3, ...] | |
| Prefix Sum array P is: | |
| P[i] = a0 + a1 + a2 + ... + ai | |
| Unlike simple sum/count, prefix sum has DATA DEPENDENCIES: | |
| P[i] depends on P[i-1], so naive parallelization is NOT possible. | |
| To parallelize prefix sum, OpenMP uses a TWO-PHASE strategy: | |
| ------------------------------------ | |
| PHASE 1: Local Prefix Sums (Parallel) | |
| ------------------------------------ | |
| 1. OpenMP creates a team of threads (fork) | |
| 2. The array is divided into chunks: | |
| - T0 → arr[0 ... k-1] | |
| - T1 → arr[k ... 2k-1] | |
| - ... | |
| 3. Each thread computes a LOCAL prefix sum for its chunk | |
| 4. Each thread stores the total sum of its chunk | |
| ------------------------------------ | |
| PHASE 2: Offset Adjustment (Parallel) | |
| ------------------------------------ | |
| 5. Prefix sums of chunk totals are computed (small array) | |
| 6. Each thread adds the correct offset to its local prefix results | |
| 7. Threads join → final correct prefix sum array | |
| This program compares: | |
| - Sequential prefix sum | |
| - Parallel prefix sum using OpenMP (two-phase method) | |
| Compile: | |
| >>> g++ -fopenmp -o parallel_prefix_sum parallel_prefix_sum.cpp | |
| Run: | |
| >>> ./parallel_prefix_sum | |
| */ | |
| #include <bits/stdc++.h> | |
| #include <omp.h> | |
| using namespace std; | |
| int main() | |
| { | |
| const long long N = 100000000; // 100 million elements | |
| vector<int> arr(N), seq_prefix(N), par_prefix(N); | |
| // Initialize array | |
| for (long long i = 0; i < N; i++) | |
| { | |
| arr[i] = rand() % 10; | |
| } | |
| // --------------------------- | |
| // 1. Sequential Prefix Sum | |
| // --------------------------- | |
| double start_seq = omp_get_wtime(); | |
| seq_prefix[0] = arr[0]; | |
| for (long long i = 1; i < N; i++) | |
| { | |
| seq_prefix[i] = seq_prefix[i - 1] + arr[i]; | |
| } | |
| double end_seq = omp_get_wtime(); | |
| double seq_time = end_seq - start_seq; | |
| // --------------------------- | |
| // 2. Parallel Prefix Sum | |
| // --------------------------- | |
| double start_par = omp_get_wtime(); | |
| int num_threads; | |
| vector<long long> chunk_sum; | |
| #pragma omp parallel | |
| { | |
| int tid = omp_get_thread_num(); | |
| int total_threads = omp_get_num_threads(); | |
| #pragma omp single | |
| { | |
| num_threads = total_threads; | |
| chunk_sum.resize(num_threads); | |
| } | |
| long long chunk_size = (N + num_threads - 1) / num_threads; | |
| long long start = tid * chunk_size; | |
| long long end = min(start + chunk_size, N); | |
| // Local prefix sum for each chunk | |
| if (start < end) | |
| { | |
| par_prefix[start] = arr[start]; | |
| for (long long i = start + 1; i < end; i++) | |
| { | |
| par_prefix[i] = par_prefix[i - 1] + arr[i]; | |
| } | |
| chunk_sum[tid] = par_prefix[end - 1]; | |
| } | |
| else | |
| { | |
| chunk_sum[tid] = 0; | |
| } | |
| #pragma omp barrier | |
| // Compute offset for each chunk | |
| long long offset = 0; | |
| for (int i = 0; i < tid; i++) | |
| { | |
| offset += chunk_sum[i]; | |
| } | |
| // Apply offset | |
| for (long long i = start; i < end; i++) | |
| { | |
| par_prefix[i] += offset; | |
| } | |
| } | |
| double end_par = omp_get_wtime(); | |
| double par_time = end_par - start_par; | |
| // --------------------------- | |
| // Verification | |
| // --------------------------- | |
| bool correct = true; | |
| for (long long i = 0; i < N; i++) | |
| { | |
| if (seq_prefix[i] != par_prefix[i]) | |
| { | |
| correct = false; | |
| break; | |
| } | |
| } | |
| // --------------------------- | |
| // Print Results | |
| // --------------------------- | |
| cout << "Prefix Sum for array of size N = " << N << "\n\n"; | |
| cout << "Sequential Time: " << seq_time << " sec\n"; | |
| cout << "Parallel Time: " << par_time << " sec\n"; | |
| cout << "\nSpeedup = " << (seq_time / par_time) << "x\n"; | |
| if (correct) | |
| { | |
| cout << "\nPrefix sums match. Parallel execution is correct.\n"; | |
| } | |
| else | |
| { | |
| cout << "\nMismatch detected! Parallel prefix sum is incorrect.\n"; | |
| } | |
| return 0; | |
| } | |
| /* | |
| Output Example: | |
| Prefix Sum for array of size N = 100000000 | |
| Sequential Time: 1.215 sec | |
| Parallel Time: 1.074 sec | |
| Speedup = 1.13129x | |
| Prefix sums match. Parallel execution is correct. | |
| ## Output depends on system architecture and thread count. | |
| */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment