Click or drag to resize
sqx

How To Do PCA On Big Data

Principal Component Analysis On Big Data

This method is faster and simple. Using MeanVarCorMatrix procedure that computes Mean Variance and Correlation Matrix in just one pass, computes PCA - Principal Component Analysis, and then using special methods of Vector transform all the data or only a chunk of the data using a WHERE clause in the cursor.

There is no need to load all the data into a Matrix and there is no need to transform all the data.

The example below is the same as on Visual Studio Magazine - Principal Component Analysis (PCA) from Scratch Using the Classical Technique with C# (2024) by James McCaffrey.

Principal Component Analysis On Big Data

  • NOTE: The Covariance Matrix of a standardize table is equal to the Correlation Matrix of the source table.

    SQL
    USE [SQX]
    GO
    
    SET NOCOUNT ON;
    SET TRANSACTION ISOLATION LEVEL READ COMMITTED;
    DROP TABLE IF EXISTS #SourceData
    ;WITH sample_data AS (SELECT [ID],[1],[2],[3],[4],[5] FROM (VALUES
    ((0),(5.1),(3.5),(1.4),(0.2),(0)),
    ((1),(4.9),(3.1),(1.5),(0.1),(0)),
    ((2),(5.1),(3.8),(1.5),(0.3),(0)),
    ((3),(7),(3.2),(4.7),(1.4),(1)),
    ((4),(5),(2),(3.5),(1),(1)),
    ((5),(5.9),(3.2),(4.8),(1.8),(1)),
    ((6),(6.3),(3.3),(6),(2.5),(2)),
    ((7),(6.5),(3.2),(5.1),(2),(2)),
    ((8),(6.9),(3.2),(5.7),(2.3),(2))
    ) AS T([ID],[1],[2],[3],[4],[5]))
    SELECT s.ID, s.[1], s.[2], s.[3], s.[4], s.[5]
    INTO #SourceData
    FROM sample_data s
    
    CREATE UNIQUE INDEX IX_SourceData_ID ON #SourceData (ID ASC)
    
    /* BIG DATA Pipeline Start Here */
    DECLARE @Query nvarchar(max) = N'
    select [1], [2], [3], [4]
    from #SourceData
    '
    DECLARE @M sqx.Vector, @V sqx.Vector, @R sqx.Matrix
    EXEC sqx.MeanVarCorMatrix @Query = @Query, @Population = 1, @SnapShot = 0
    , @Means = @M OUTPUT, @Vars = @V OUTPUT, @Cor = @R OUTPUT
    
    SELECT @V = @V.Sqrt()
    
    DECLARE @PCAID int, @Rotation int, @EIGENVECTORS [sqx].[Matrix]
    , @EIGENVALUES [sqx].[Vector], @VAREXPLAINED [sqx].[Vector]
    
    SELECT @EIGENVECTORS = pcat.EigenVectors
    , @EIGENVALUES = pcat.EigenValues
    , @VAREXPLAINED = pcat.VarianceExplained
    , @Rotation = pcat.Rotations
    FROM sqx.PrincipalComponentAnalysisTable(@R) pcat
    
    -- Not Mandatory
    -- INSERT [PCA] ([Description],[MEANS],[STDEVS],[COR]
    -- ,[EIGENVECTORS],[EIGENVALUES],[VAREXPLAINED])
    -- VALUES (N'Big Data PCA',@M,@V,@R,@EIGENVECTORS,@EIGENVALUES,@VAREXPLAINED)
    -- SELECT @PCAID = @@IDENTITY
    
    DROP TABLE IF EXISTS #TransformedData
    CREATE TABLE #TransformedData
    ([ID] int NOT NULL PRIMARY KEY CLUSTERED
    ,[1] float,[2] float,[3] float,[4] float)
    
    DECLARE @ID int, @V1 float, @V2 float, @V3 float, @V4 float
    ,@VD sqx.Vector = sqx.ZeroVector(4)
    DECLARE cur CURSOR FAST_FORWARD
    FOR SELECT [ID], [1], [2], [3], [4]
    FROM #SourceData
    -- WHERE [MyDate] BETWEEN @Date1 AND GETDATE()
    OPEN cur
    FETCH NEXT FROM cur INTO @ID,@V1,@V2,@V3,@V4
    WHILE (@@FETCH_STATUS != -1)
    BEGIN
    SELECT @VD = @VD.SetValue(T.X, T.I)
    FROM (VALUES (1,@V1), (2,@V2), (3,@V3), (4,@V4)) AS T(I,X)
    SET @VD = @VD.MatrixColumnVectorStandardize(@M,@V).MatrixColumnVectorProduct(@EIGENVECTORS)
    INSERT #TransformedData
    EXEC sqx.VectorToTable @ID, @VD
    FETCH NEXT FROM cur INTO @ID,@V1,@V2,@V3,@V4
    END
    CLOSE cur
    DEALLOCATE cur
    
    -- Show Results
    PRINT '** Correlation Matrix'
    EXEC sqx.MatrixToTable @R
    
    PRINT '** Means, StDevs, Eigen Values and Variance Explained'
    SELECT m.ID FieldNumber, m.Value Mean, s.Value [StDev]
    , a.[Value] [EigenValue], b.[Value] [VarianceExplained]
    FROM sqx.VectorTable(@M) m
    INNER JOIN sqx.VectorTable(@V) s ON m.ID = s.ID
    INNER JOIN [sqx].[VectorTable](@EIGENVALUES) a ON m.ID = a.ID
    INNER JOIN [sqx].[VectorTable](@VAREXPLAINED) b ON m.[ID] = b.[ID]
    ORDER BY b.[Value] DESC
    
    PRINT '** Eigen Vectors'
    EXEC sqx.MatrixToTable @EIGENVECTORS
    
    PRINT '** Show Data'
    SELECT t.*, s.*
    FROM #TransformedData t, #SourceData s
    WHERE t.[ID] = s.[ID]
    
    DROP TABLE IF EXISTS #TransformedData
    DROP TABLE IF EXISTS #SourceData
    -- And it can be all saved in PCA table except Data
    /*
    PCA Rotations : 1600
    ** Correlation Matrix
    ID          1                      2                      3                      4
    ----------- ---------------------- ---------------------- ---------------------- ----------------------
    1           1                      0.146574230062663      0.84786268242462       0.813104741419685
    2           0.146574230062663      1                      -0.175673686188737     -0.0918692447874022
    3           0.84786268242462       -0.175673686188737     1                      0.986243572832952
    4           0.813104741419685      -0.0918692447874022    0.986243572832952      1
    
    ** Means, StDevs, Eigen Values and Variance Explained
    FieldNumber Mean                   StDev                  EigenValue             VarianceExplained
    ----------- ---------------------- ---------------------- ---------------------- ----------------------
    1           5.85555555555556       0.802926745117319      2.77035570477325       0.692588926193314
    2           3.16666666666667       0.459468291736341      1.06042756746187       0.265106891865467
    3           3.8                    1.77576274691563       0.166001159367613      0.0415002898419033
    4           1.28888888888889       0.877426089884428      0.00321556839726341    0.000803892099315852
    
    ** Eigen Vectors
    ID          1                      2                      3                      4
    ----------- ---------------------- ---------------------- ---------------------- ----------------------
    1           0.549818337635731      0.239524007501859      -0.782298471507741     0.168336409924837
    2           -0.0438081505535337    0.963742668683788      0.24202609118867       -0.103461517905271
    3           0.59394004579127       -0.110213667522951     0.218796347641008      -0.766300416126443
    4           0.585675561680717      -0.0410032423064341    0.530622870709076      0.611344616113363
    
    ** Show Data
    ID          1                      2                      3                      4                      ID          1                                       2                                       3                                       4                                       5
    ----------- ---------------------- ---------------------- ---------------------- ---------------------- ----------- --------------------------------------- --------------------------------------- --------------------------------------- --------------------------------------- -----------
    0           -2.07871621536819      0.673622338895132      -0.0424866147460087    0.0435348734398706     0           5.1                                     3.5                                     1.4                                     0.2                                     0
    1           -2.2108338748977       -0.226580717665573     -0.106479578770823     -0.0211532610246281    1           4.9                                     3.1                                     1.5                                     0.1                                     0
    2           -2.00712349667912      1.2919978330871        0.18833534486084       0.00250334675782038    2           5.1                                     3.8                                     1.5                                     0.3                                     0
    3           1.15568967230191       0.350269293429887      -0.919397986099258     -0.0785324780373222    3           7.0                                     3.2                                     4.7                                     1.4                                     1
    4           -0.767793001170141     -2.67020755287846      0.00736139037068351    0.0115130957365036     4           5.0                                     2.0                                     3.5                                     1.0                                     1
    5           0.702889293398109      -0.00277478560648666   0.40656253289585       -0.073605566375896     5           5.9                                     3.2                                     4.8                                     1.8                                     1
    6           1.83587125085685       0.219111861160726      0.640694005278087      -0.0423781588211668    6           6.3                                     3.3                                     6.0                                     2.5                                     2
    7           1.34758960781168       0.148247487518865      -0.0201090024812673    0.0620761484510529     7           6.5                                     3.2                                     5.1                                     2.0                                     2
    8           2.02242676374659       0.216314242058819      -0.154480091308106     0.0960419998737654     8           6.9                                     3.2                                     5.7                                     2.3                                     2*/
    
See Also