[PYTHON] Umgang mit dünn besetzten Matrizen in SciPy

Ein Memo zur Berechnung von Ähnlichkeit und Abstand dünn besetzter Dokumentvektoren mit Python

Grundlegende Operationen mit dünn besetzten Matrizen in SciPy. Ich habe das vor langer Zeit geschrieben, daher ist möglicherweise nicht mehr alles korrekt.

`Matrixberechnung`


import scipy.sparse as sp
import numpy as np

# Create two empty 1 x 10000 sparse row vectors in LIL (list-of-lists) format.
a = sp.lil_matrix((1, 10000))
b = sp.lil_matrix((1, 10000))
# a.shape => (1, 10000)

# Fill both vectors the same way: draw one random number per column and keep
# it only when it is >= 0.9, so roughly 10% of the entries end up non-zero.
for vec in (a, b):
    for i in range(vec.shape[1]):  # range() works on both Python 2 and 3
        r = np.random.rand()
        if r >= 0.9:
            vec[0, i] = r

# REPL: show the summary of the matrix
a
# => <1x10000 sparse matrix of type '<type 'numpy.float64'>'
#         with 947 stored elements in LInked List format>
# (example output; the stored-element count varies from run to run)

`Umwandlung`


# Convert the LIL matrix to CSR (Compressed Sparse Row) format.
ca = a.tocsr()
# REPL: show the summary of the converted matrix
ca
# => <1x10000 sparse matrix of type '<type 'numpy.float64'>'
#         with 947 stored elements in Compressed Sparse Row format>
# lil => csr: same shape and same stored elements, different storage format

`Matrixprodukt`


# Transposed matrix
ta = a.T
# Matrix product of the row vector with its transpose.
# print() with a single argument behaves the same on Python 2 and 3.
print(a.dot(ta))  # the (1, 1) result is itself still a sparse matrix
# => (0, 0)        853.19504342

`Betrag (Norm) eines Vektors`


import math

import scipy.sparse.linalg as spla
import scipy.spatial.distance as dis

v = np.array([[1, 1]])
# np.dot(v, v.T) is a 1 x 1 array; extract the scalar explicitly instead of
# relying on the implicit array -> float conversion.
math.sqrt(np.dot(v, v.T).item())
# => 1.4142135623730951
np.linalg.norm(v)
# => 1.4142135623730951

# np.linalg.norm is expected to fail when handed a sparse matrix directly.
# The failure is caught so that the rest of the snippet still runs as a
# script; the exact exception type is not pinned down here.
try:
    np.linalg.norm(a)
except Exception as err:
    print("np.linalg.norm(a) failed: %s" % err)
# Converting to a dense matrix/array first works:
np.linalg.norm(a.todense())
np.linalg.norm(a.toarray())
# => 29.209502621916037
# scipy.sparse.linalg.norm computes the same norm without densifying:
spla.norm(a)

# Cosine: note that dis.cosine returns the cosine *distance*
# (1 - cosine similarity), not the similarity itself.
# The inputs are flattened to 1-D because dis.cosine expects 1-D vectors.
dis.cosine(a.toarray().ravel(), b.toarray().ravel())
# => 0.91347774109309299

`Euklidischer Abstand dünn besetzter Matrizen`


# -*- encoding: utf-8 -*-

import scipy.spatial.distance as dis
import scipy.sparse as sp
import numpy as np, scipy.io as io
import math

def sparse_distance(v1, v2):
    """Return the Euclidean distance between two 1 x N vectors.

    args:
        v1, v2 : 1 x N (sparse) matrices. Any SciPy sparse format is
            accepted; dense array-likes are accepted as well and are
            flattened before the distance is computed.

    returns:
        The Euclidean distance between v1 and v2.

    raises:
        ValueError: if the sparse inputs do not share the same shape, or
            if the dense inputs do not have the same number of elements.
    """
    if sp.issparse(v1) and sp.issparse(v2):
        # Compare dimensions, not the number of stored (non-zero) entries:
        # two vectors of equal length may have different sparsity patterns.
        if v1.shape != v2.shape:
            raise ValueError(
                "shape mismatch: %r vs %r" % (v1.shape, v2.shape))
        # Convert to CSR so the subtraction works for every sparse format.
        # Each index contributes exactly once, including indices that are
        # non-zero in both vectors.
        diff = v1.tocsr() - v2.tocsr()
        return math.sqrt(diff.multiply(diff).sum())

    # At least one operand is dense: fall back to SciPy's euclidean(),
    # which wants 1-D input, so densify (if needed) and flatten both sides.
    flat1 = np.asarray(v1.toarray() if sp.issparse(v1) else v1).ravel()
    flat2 = np.asarray(v2.toarray() if sp.issparse(v2) else v2).ravel()
    if flat1.size != flat2.size:
        raise ValueError(
            "size mismatch: %d vs %d" % (flat1.size, flat2.size))
    return dis.euclidean(flat1, flat2)