Comment accélérer les calculs Python

Il y a eu une discussion sur la façon d'accélérer le calcul de l'intégrale dans ce programme. [Écriture d'un programme 1 million de fois plus rapide - essayez d'améliorer la précision honnêtement (11 chiffres 53 secondes)](https://qiita.com/Akai_Banana/items/48a35d2a40d1804d3b32#%E6%84%9A%E7%9B%B4 % E3% 81% AB% E7% B2% BE% E5% BA% A6% E3% 82% 92% E3% 81% 82% E3% 81% 92% E3% 81% A6% E3% 81% BF% E3 Nous procéderons sur la base du code de% 82% 8B11% E6% A1% 8153% E7% A7% 92).

Quand j'ai exécuté le code à portée de main, j'ai obtenu le résultat suivant. Environ 22 secondes

$ python3 -V
Python 3.5.1
$ time python3 wallis.py
36722.3621743
python3 wallis.py  21.89s user 0.07s system 99% cpu 22.014 total

Tout d'abord, déposons-le dans le code de langage C.

`wallis_v1.cc`


#include <stdio.h>
#include <stdlib.h>
#include <math.h>

double f(double x)
{
    return sin(M_PI * x);
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }

    double product = 1;
    for (int m = 1;m < 11;m++) {
        double sum = 0;
        for (int k = 0;k < n;k++) {
            sum += (1.0 / (double)n) * pow(f((double)k / (double)n), m);
        }
        product *= sum;
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}

$ time ./wallis_v1 1000000
result=36722.362174
./wallis_v1 1000000  0.75s user 0.01s system 97% cpu 0.777 total

Cela seul est beaucoup plus rapide.

Ensuite, pensons à le rendre multi-thread.

`wallis_v2.cc`


#include <stdio.h>
#include <stdlib.h>
#include <math.h>

inline double f(double x)
{
    return sin(M_PI * x);
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }

    double product_array[10];
    
    double _n = (double)n;
    double inv_n = 1.0 / _n;
 #pragma omp parallel for
    for (int m = 1;m < 11;m++) {
        double sum = 0;
        for (int k = 0;k < n;k++) {
            double x = (double)k * inv_n;
            sum += inv_n * pow(f(x), m);
        }
        product_array[m - 1] = sum;
    }
    double product = 1;
    for (int i = 0;i < 10;i++) {
        product *= product_array[i];
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}

$ g++-6 -O3 -fopenmp -o wallis_v2 wallis_v2.cc
$ time ./wallis_v2 1000000
result=36722.362174
./wallis_v2 1000000  0.86s user 0.01s system 292% cpu 0.296 total

Pour le moment, l'effet du multi-threading s'est manifesté.

Vient ensuite l'implémentation de pow, mais cette fois nous limiterons le traitement à une autre fonction pour les entiers.

`(Extrait) Wallis_v3.cc`


double _pow(double x, int n)
{
    double y = x;
    for (int i = 1;i < n;i++) {
        y *= x;
    }
    return y;
}

//Omission
    for (int m = 1;m < 11;m++) {
        double sum = 0;
        for (int k = 0;k < n;k++) {
            double x = (double)k * inv_n;
            sum += inv_n * _pow(f(x), m);
        }
        product_array[m - 1] = sum;
    }

$ g++-6 -O3 -fopenmp -o wallis_v3 wallis_v3.cc
$ time ./wallis_v3 1000000
result=36722.362174
./wallis_v3 1000000  0.34s user 0.00s system 255% cpu 0.133 total

J'ai essayé d'utiliser l'instruction FMA, mais cela n'a pas été beaucoup plus rapide.

`wallis_v4.cc`


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <immintrin.h>

inline double f(double x)
{
    return sin(M_PI * x);
}

double _pow(double x, int n)
{
    double y = x;
    for (int i = 1;i < n;i++) {
        y *= x;
    }
    return y;
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }

    double product_array[10];
    
    double _n = (double)n;
    double inv_n = 1.0 / _n;
    double inv_n2 = 2.0 / _n;
    double inv_n3 = 3.0 / _n;
 #pragma omp parallel for
    for (int m = 1;m < 11;m++) {
        double __attribute__((aligned(32))) vec1[4] = {0, 0, 0, 0}, vec2[4], vec3[4];
        __m256d v1 = _mm256_load_pd(vec1);
        __m256d v2;
        __m256d v3 = _mm256_broadcast_sd(&inv_n);

        for (int k = 0;k < n;k+=4) {
            double x = (double)k * inv_n;
            vec2[0] = _pow(f(x), m);
            vec2[1] = _pow(f(x + inv_n), m);
            vec2[2] = _pow(f(x + inv_n2), m);
            vec2[3] = _pow(f(x + inv_n3), m);
            v2 = _mm256_load_pd(vec2);
            v1 = _mm256_fmadd_pd(v2, v3, v1);
        }
        _mm256_store_pd(vec3, v1);
        product_array[m - 1] = vec3[0] + vec3[1] + vec3[2] + vec3[3];
    }
    double product = 1;
    for (int i = 0;i < 10;i++) {
        product *= product_array[i];
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}

$ g++-6 -O3 -fopenmp -mfma -o wallis_v4 wallis_v4.cc
$ time ./wallis_v4 1000000
result=36722.362174
./wallis_v4 1000000  0.33s user 0.00s system 256% cpu 0.128 total

Optimisons maintenant la boucle. Je faisais la même chose pour k dans la boucle de m, alors je l'ai remplacé.

`wallis_v5.cc`


#include <stdio.h>
#include <stdlib.h>
#include <math.h>

static inline double f(double x)
{
    return sin(M_PI * x);
}

static inline double _pow(double x, int n)
{
    double y = x;
    for (int i = 1;i < n;i++) {
        y *= x;
    }
    return y;
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }

    double product_array[10];
    for (int i = 0;i < 10;i++) {
        product_array[i] = 0;
    }
    
    double _n = (double)n;
    double inv_n = 1.0 / _n;

    for (int k = 0;k < n;k++) {
        double x = (double)k * inv_n;
        for (int m = 1;m < 11;m++) {
            product_array[m - 1] += _pow(f(x), m);
        }
    }
    double product = 1;
    for (int i = 0;i < 10;i++) {
        product *= product_array[i] * inv_n;
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}

$ time ./wallis_v5 1000000
result=36722.362174
./wallis_v5 1000000  0.08s user 0.00s system 95% cpu 0.089 total

Je n'ai plus utilisé OpenMP, mais j'ai pu le faire plus rapidement.

À la fin, j'ai perdu le traitement de _pow. Cela réduit les calculs inutiles.

`wallis_v6.cc`


#include <stdio.h>
#include <stdlib.h>
#include <math.h>

static inline double f(double x)
{
    return sin(M_PI * x);
}

int main(int argc, char** argv)
{
    int n = 0;
    if (argc > 1) {
        n = atoi(argv[1]);
    }

    double product_array[10];
    for (int i = 0;i < 10;i++) {
        product_array[i] = 0;
    }
    
    double _n = (double)n;
    double inv_n = 1.0 / _n;

    for (int k = 0;k < n;k++) {
        double x = (double)k * inv_n;
        double y = f(x);
        double z = y;
        for (int m = 0;m < 10;m++) {
            product_array[m] += z;
            z *= y;
        }
    }
    double product = 1;
    for (int i = 0;i < 10;i++) {
        product *= product_array[i] * inv_n;
    }
    printf("result=%f\n", (1.0 / product));
    return 0;
}

$ g++-6 -O3 -o wallis_v6 wallis_v6.cc
$ time ./wallis_v6 1000000
result=36722.362174
./wallis_v6 1000000  0.04s user 0.00s system 88% cpu 0.049 total

Nous avons pu passer de 0,777 s à 0,049 s au stade du langage C. Sur cette base, nous écrirons du code Python.

`wallis_v2.py`


from numpy import *

def f(x):
    return sin(pi*x)

product_array = [0 for i in range(10)]


n = 1000000
inv_n = 1.0 / n
for k in range(n):
    x = k * inv_n
    y = f(x)
    z = y
    for m in range(10):
        product_array[m] += z
        z *= y

product = prod(list(map(lambda x: x * inv_n, product_array)))
result = 1.0/product
print(result)

$ time python3 wallis_v2.py
36722.3621743
python3 wallis_v2.py  8.87s user 0.09s system 99% cpu 9.040 total

J'ai pu passer de 22 à 9. Il semble important d'éliminer autant que possible les calculs en double.

référence

[Écrivez un programme 1 million de fois plus vite](https://qiita.com/Akai_Banana/items/48a35d2a40d1804d3b32#%E6%84%9A%E7%9B%B4%E3%81%AB%E7%B2%BE%E5 % BA% A6% E3% 82% 92% E3% 81% 82% E3% 81% 92% E3% 81% A6% E3% 81% BF% E3% 82% 8B11% E6% A1% 8153% E7% A7 % 92)