![]() | How To Do PCA On Big Data |
Principal Component Analysis On Big Data
This method is faster and simple. Using MeanVarCorMatrix procedure that computes Mean Variance and Correlation Matrix in just one pass, computes PCA - Principal Component Analysis, and then using special methods of Vector transform all the data or only a chunk of the data using a WHERE clause in the cursor.
There is no need to load all the data into a Matrix and there is no need to transform all the data.
The example below is the same as on Visual Studio Magazine - Principal Component Analysis (PCA) from Scratch Using the Classical Technique with C# (2024) by James McCaffrey.
NOTE: The Covariance Matrix of a standardize table is equal to the Correlation Matrix of the source table.
USE [SQX] GO SET NOCOUNT ON; SET TRANSACTION ISOLATION LEVEL READ COMMITTED; DROP TABLE IF EXISTS #SourceData ;WITH sample_data AS (SELECT [ID],[1],[2],[3],[4],[5] FROM (VALUES ((0),(5.1),(3.5),(1.4),(0.2),(0)), ((1),(4.9),(3.1),(1.5),(0.1),(0)), ((2),(5.1),(3.8),(1.5),(0.3),(0)), ((3),(7),(3.2),(4.7),(1.4),(1)), ((4),(5),(2),(3.5),(1),(1)), ((5),(5.9),(3.2),(4.8),(1.8),(1)), ((6),(6.3),(3.3),(6),(2.5),(2)), ((7),(6.5),(3.2),(5.1),(2),(2)), ((8),(6.9),(3.2),(5.7),(2.3),(2)) ) AS T([ID],[1],[2],[3],[4],[5])) SELECT s.ID, s.[1], s.[2], s.[3], s.[4], s.[5] INTO #SourceData FROM sample_data s CREATE UNIQUE INDEX IX_SourceData_ID ON #SourceData (ID ASC) /* BIG DATA Pipeline Start Here */ DECLARE @Query nvarchar(max) = N' select [1], [2], [3], [4] from #SourceData ' DECLARE @M sqx.Vector, @V sqx.Vector, @R sqx.Matrix EXEC sqx.MeanVarCorMatrix @Query = @Query, @Population = 1, @SnapShot = 0 , @Means = @M OUTPUT, @Vars = @V OUTPUT, @Cor = @R OUTPUT SELECT @V = @V.Sqrt() DECLARE @PCAID int, @Rotation int, @EIGENVECTORS [sqx].[Matrix] , @EIGENVALUES [sqx].[Vector], @VAREXPLAINED [sqx].[Vector] SELECT @EIGENVECTORS = pcat.EigenVectors , @EIGENVALUES = pcat.EigenValues , @VAREXPLAINED = pcat.VarianceExplained , @Rotation = pcat.Rotations FROM sqx.PrincipalComponentAnalysisTable(@R) pcat -- Not Mandatory -- INSERT [PCA] ([Description],[MEANS],[STDEVS],[COR] -- ,[EIGENVECTORS],[EIGENVALUES],[VAREXPLAINED]) -- VALUES (N'Big Data PCA',@M,@V,@R,@EIGENVECTORS,@EIGENVALUES,@VAREXPLAINED) -- SELECT @PCAID = @@IDENTITY DROP TABLE IF EXISTS #TransformedData CREATE TABLE #TransformedData ([ID] int NOT NULL PRIMARY KEY CLUSTERED ,[1] float,[2] float,[3] float,[4] float) DECLARE @ID int, @V1 float, @V2 float, @V3 float, @V4 float ,@VD sqx.Vector = sqx.ZeroVector(4) DECLARE cur CURSOR FAST_FORWARD FOR SELECT [ID], [1], [2], [3], [4] FROM #SourceData -- WHERE [MyDate] BETWEEN @Date1 AND GETDATE() OPEN cur FETCH NEXT FROM cur INTO @ID,@V1,@V2,@V3,@V4 WHILE (@@FETCH_STATUS != -1) BEGIN SELECT @VD = @VD.SetValue(T.X, T.I) FROM (VALUES (1,@V1), (2,@V2), (3,@V3), (4,@V4)) AS T(I,X) SET @VD = @VD.MatrixColumnVectorStandardize(@M,@V).MatrixColumnVectorProduct(@EIGENVECTORS) INSERT #TransformedData EXEC sqx.VectorToTable @ID, @VD FETCH NEXT FROM cur INTO @ID,@V1,@V2,@V3,@V4 END CLOSE cur DEALLOCATE cur -- Show Results PRINT '** Correlation Matrix' EXEC sqx.MatrixToTable @R PRINT '** Means, StDevs, Eigen Values and Variance Explained' SELECT m.ID FieldNumber, m.Value Mean, s.Value [StDev] , a.[Value] [EigenValue], b.[Value] [VarianceExplained] FROM sqx.VectorTable(@M) m INNER JOIN sqx.VectorTable(@V) s ON m.ID = s.ID INNER JOIN [sqx].[VectorTable](@EIGENVALUES) a ON m.ID = a.ID INNER JOIN [sqx].[VectorTable](@VAREXPLAINED) b ON m.[ID] = b.[ID] ORDER BY b.[Value] DESC PRINT '** Eigen Vectors' EXEC sqx.MatrixToTable @EIGENVECTORS PRINT '** Show Data' SELECT t.*, s.* FROM #TransformedData t, #SourceData s WHERE t.[ID] = s.[ID] DROP TABLE IF EXISTS #TransformedData DROP TABLE IF EXISTS #SourceData -- And it can be all saved in PCA table except Data /* PCA Rotations : 1600 ** Correlation Matrix ID 1 2 3 4 ----------- ---------------------- ---------------------- ---------------------- ---------------------- 1 1 0.146574230062663 0.84786268242462 0.813104741419685 2 0.146574230062663 1 -0.175673686188737 -0.0918692447874022 3 0.84786268242462 -0.175673686188737 1 0.986243572832952 4 0.813104741419685 -0.0918692447874022 0.986243572832952 1 ** Means, StDevs, Eigen Values and Variance Explained FieldNumber Mean StDev EigenValue VarianceExplained ----------- ---------------------- ---------------------- ---------------------- ---------------------- 1 5.85555555555556 0.802926745117319 2.77035570477325 0.692588926193314 2 3.16666666666667 0.459468291736341 1.06042756746187 0.265106891865467 3 3.8 1.77576274691563 0.166001159367613 0.0415002898419033 4 1.28888888888889 0.877426089884428 0.00321556839726341 0.000803892099315852 ** Eigen Vectors ID 1 2 3 4 ----------- ---------------------- ---------------------- ---------------------- ---------------------- 1 0.549818337635731 0.239524007501859 -0.782298471507741 0.168336409924837 2 -0.0438081505535337 0.963742668683788 0.24202609118867 -0.103461517905271 3 0.59394004579127 -0.110213667522951 0.218796347641008 -0.766300416126443 4 0.585675561680717 -0.0410032423064341 0.530622870709076 0.611344616113363 ** Show Data ID 1 2 3 4 ID 1 2 3 4 5 ----------- ---------------------- ---------------------- ---------------------- ---------------------- ----------- --------------------------------------- --------------------------------------- --------------------------------------- --------------------------------------- ----------- 0 -2.07871621536819 0.673622338895132 -0.0424866147460087 0.0435348734398706 0 5.1 3.5 1.4 0.2 0 1 -2.2108338748977 -0.226580717665573 -0.106479578770823 -0.0211532610246281 1 4.9 3.1 1.5 0.1 0 2 -2.00712349667912 1.2919978330871 0.18833534486084 0.00250334675782038 2 5.1 3.8 1.5 0.3 0 3 1.15568967230191 0.350269293429887 -0.919397986099258 -0.0785324780373222 3 7.0 3.2 4.7 1.4 1 4 -0.767793001170141 -2.67020755287846 0.00736139037068351 0.0115130957365036 4 5.0 2.0 3.5 1.0 1 5 0.702889293398109 -0.00277478560648666 0.40656253289585 -0.073605566375896 5 5.9 3.2 4.8 1.8 1 6 1.83587125085685 0.219111861160726 0.640694005278087 -0.0423781588211668 6 6.3 3.3 6.0 2.5 2 7 1.34758960781168 0.148247487518865 -0.0201090024812673 0.0620761484510529 7 6.5 3.2 5.1 2.0 2 8 2.02242676374659 0.216314242058819 -0.154480091308106 0.0960419998737654 8 6.9 3.2 5.7 2.3 2*/