Parallel accumulate vs Sequential accumulate

It looks like the parallel and sequential implementation spent also the same
amount of time adding up numbers. It could be due to the time is so small.
Both sequential and parallel versions take less than a second.

Total Number of numbers: 429496729
Sequential
Time: 948481
Result: 0
Parallel
Time: 941302
Result: 0


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <chrono>

template <typename InputIterator, typename T>
struct accumulate_block {
    void operator() (InputIterator first, InputIterator last, T& result) {
        result = std::accumulate(first, last, result);           
    }
};

template <typename ForwardIterator, typename T>
T parallel_accumulate(ForwardIterator first, ForwardIterator last, T init) {
    size_t totalInput = std::distance(first, last);
    if (totalInput == 0) return init;
    
    // Get maximum thread allowed on the hardware
    size_t hardwareConcurrency = std::thread::hardware_concurrency()? std::thread::hardware_concurrency(): 2;

    // If number per thread is too small, the overhead of bookkeep may outweigh 
    // the benefits.
    const size_t minNumberPerThread = 25;
    // Each thread handles exactly same number of numbers except the main thread
    // which deals with the remainder in addition to its own share
    // e.g. totalInput (80) / minNumberPerThread(25) = numberOfThread(3)
    // Thread 0 handles 25 numbers, and thread 1 25; thread 2 handles 25 + 5 = 30
    size_t numberOfThreads = (totalInput + minNumberPerThread - 1)/ minNumberPerThread;
    
    // make sure there's no oversubscription
    numberOfThreads = std::min(numberOfThreads, hardwareConcurrency);
    size_t numberPerThread = totalInput / numberOfThreads;

    //std::cout << "numberOfThreads: " << numberOfThreads << std::endl;
    //std::cout << "numberPerThread: " << numberPerThread << std::endl;
    
    std::vector<T> results(numberOfThreads);
    std::vector<std::thread> threads(numberOfThreads - 1);
    ForwardIterator begin = first;
    for (size_t i = 0; (i + 1) < numberOfThreads; ++i) {
        ForwardIterator end = std::next(begin, numberOfThreads);
        threads[i] = std::thread(accumulate_block<ForwardIterator, T>(), begin, end, std::ref(results[i]));
        begin = end;
    }
    accumulate_block<ForwardIterator, T>()(begin, last, results.back());
    for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
    return accumulate(results.begin(), results.end(), init);
}

const int N = std::numeric_limits<int>::max() / 5;
int nums[N];
int main(int argc, char* argv[]) {
    std::cout << "Total Number of numbers: " << N << std::endl;
    if (1) {
        std::cout << "Sequential" << std::endl;
        std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
        int ret = std::accumulate(std::begin(nums), std::end(nums), 0);
        std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
        std::cout << "Time: " << duration << std::endl;
        std::cout << "Result: " << ret << std::endl;
    }
    if (1) {
        std::cout << "Parallel" << std::endl;
        std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
        int ret = parallel_accumulate(std::begin(nums), std::end(nums), 0);
        std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
        std::cout << "Time: " << duration << std::endl;
        std::cout << "Result: " << ret << std::endl;
    }

    return 0;
}

Popular posts from this blog

LeetCode 68 Text Justification (C++, Python)

How Django Works (4) URL Resolution

Python Class Method and Class Only Method