![]() | How To Do PCA On Small Data |
Principal Component Analysis On Small Data
The defenition for Small Data is less than 2GB, in other words, the Matrix capacity.
The example below is the same as on Visual Studio Magazine - Principal Component Analysis (PCA) from Scratch Using the Classical Technique with C# (2024) by James McCaffrey.
USE [SQX] GO SET NOCOUNT ON; SET TRANSACTION ISOLATION LEVEL READ COMMITTED; DROP TABLE IF EXISTS #SourceData ;WITH sample_data AS (SELECT [ID],[1],[2],[3],[4],[5] FROM (VALUES ((0),(5.1),(3.5),(1.4),(0.2),(0)), ((1),(4.9),(3.1),(1.5),(0.1),(0)), ((2),(5.1),(3.8),(1.5),(0.3),(0)), ((3),(7),(3.2),(4.7),(1.4),(1)), ((4),(5),(2),(3.5),(1),(1)), ((5),(5.9),(3.2),(4.8),(1.8),(1)), ((6),(6.3),(3.3),(6),(2.5),(2)), ((7),(6.5),(3.2),(5.1),(2),(2)), ((8),(6.9),(3.2),(5.7),(2.3),(2)) ) AS T([ID],[1],[2],[3],[4],[5])) SELECT ROW_NUMBER() OVER (ORDER BY s.ID) [ID], s.ID [SourceID] , s.[1], s.[2], s.[3], s.[4], s.[5] INTO #SourceData FROM sample_data s CREATE UNIQUE INDEX IX_SourceData_ID ON #SourceData (ID ASC) /* SMALL DATA Pipeline Start Here */ DECLARE @PCAID int, @MEANS [sqx].[Vector], @STDEVS [sqx].[Vector] , @DATA [sqx].[Matrix], @COV [sqx].[Matrix] DECLARE @Query nvarchar(max) = N' select [1], [2], [3], [4] from #SourceData order by [ID] ' EXEC [sqx].[MatrixFromQuery] @Query = @Query, @SnapShot = 0, @A = @DATA OUTPUT SELECT @MEANS = @DATA.Avg(), @STDEVS = @DATA.StDevp() SET @DATA = @DATA.Standardize(1) SET @COV = @DATA.Covariance(1) INSERT [PCA] ([Description],[DATA],[MEANS],[STDEVS],[COV]) VALUES (N'Hello PCA',@DATA,@MEANS,@STDEVS,@COV) SELECT @PCAID = @@IDENTITY DECLARE @Rotation int, @EIGENVECTORS [sqx].[Matrix] , @EIGENVALUES [sqx].[Vector], @VAREXPLAINED [sqx].[Vector] SELECT @EIGENVECTORS = pcat.EigenVectors , @EIGENVALUES = pcat.EigenValues , @VAREXPLAINED = pcat.VarianceExplained , @Rotation = pcat.Rotations FROM sqx.PrincipalComponentAnalysisTable(@R) pcat UPDATE [PCA] SET [EIGENVECTORS] = @EIGENVECTORS, [EIGENVALUES] = @EIGENVALUES , [VAREXPLAINED] = @VAREXPLAINED WHERE [ID] = @PCAID -- Transform Data UPDATE [PCA] SET [DATA] = [DATA].Product([EIGENVECTORS]) WHERE [ID] = @PCAID -- Show Results PRINT '** Covariance Matrix' SELECT @COV = [COV] FROM [PCA] WHERE [ID] = @PCAID EXEC sqx.MatrixToTable @COV PRINT '** Means, StDevs, Eigen Values and Variance Explained' SELECT m.ID FieldNumber, m.Value Mean, s.Value [StDev] , a.[Value] [EigenValue], b.[Value] [VarianceExplained] FROM sqx.VectorTable(@MEANS) m INNER JOIN sqx.VectorTable(@STDEVS) s ON m.ID = s.ID INNER JOIN [sqx].[VectorTable](@EIGENVALUES) a ON m.ID = a.ID INNER JOIN [sqx].[VectorTable](@VAREXPLAINED) b ON m.[ID] = b.[ID] ORDER BY b.[Value] DESC PRINT '** Eigen Vectors' EXEC sqx.MatrixToTable @EIGENVECTORS PRINT '** Show Data' DROP TABLE IF EXISTS #TransformedData CREATE TABLE #TransformedData ([ID] int NOT NULL PRIMARY KEY CLUSTERED ,[1] float,[2] float,[3] float,[4] float) SELECT @DATA = [DATA] FROM [PCA] WHERE [ID] = @PCAID INSERT #TransformedData EXEC [sqx].[MatrixToTable] @DATA SELECT t.*, s.* FROM #TransformedData t, #SourceData s WHERE t.[ID] = s.[ID] DROP TABLE IF EXISTS #TransformedData DROP TABLE IF EXISTS #SourceData -- And it's all saved in PCA table except Source Data /* PCA Rotations : 1600 ** Covariance Matrix ID 1 2 3 4 ----------- ---------------------- ---------------------- ---------------------- ---------------------- 1 1 0.146574230062663 0.84786268242462 0.813104741419685 2 0.146574230062663 1 -0.175673686188737 -0.0918692447874022 3 0.84786268242462 -0.175673686188737 1 0.986243572832952 4 0.813104741419685 -0.0918692447874022 0.986243572832952 1 ** Means, StDevs, Eigen Values and Variance Explained FieldNumber Mean StDev EigenValue VarianceExplained ----------- ---------------------- ---------------------- ---------------------- ---------------------- 1 5.85555555555556 0.802926745117319 2.77035570477325 0.692588926193314 2 3.16666666666667 0.459468291736341 1.06042756746187 0.265106891865467 3 3.8 1.77576274691563 0.166001159367613 0.0415002898419033 4 1.28888888888889 0.877426089884428 0.00321556839726321 0.000803892099315804 ** Eigen Vectors ID 1 2 3 4 ----------- ---------------------- ---------------------- ---------------------- ---------------------- 1 0.549818337635731 0.239524007501859 -0.782298471507741 0.168336409924838 2 -0.043808150553534 0.963742668683788 0.24202609118867 -0.103461517905271 3 0.59394004579127 -0.110213667522951 0.218796347641008 -0.766300416126443 4 0.585675561680717 -0.0410032423064341 0.530622870709076 0.611344616113363 ** Show Data ID 1 2 3 4 ID SourceID 1 2 3 4 5 ----------- ---------------------- ---------------------- ---------------------- ---------------------- -------------------- ----------- --------------------------------------- --------------------------------------- --------------------------------------- --------------------------------------- ----------- 1 -2.07871621536819 0.673622338895131 -0.0424866147460089 0.0435348734398709 1 0 5.1 3.5 1.4 0.2 0 2 -2.2108338748977 -0.226580717665575 -0.106479578770823 -0.0211532610246278 2 1 4.9 3.1 1.5 0.1 0 3 -2.00712349667912 1.2919978330871 0.18833534486084 0.0025033467578206 3 2 5.1 3.8 1.5 0.3 0 4 1.15568967230191 0.350269293429886 -0.919397986099257 -0.0785324780373214 4 3 7.0 3.2 4.7 1.4 1 5 -0.76779300117014 -2.67020755287846 0.00736139037068315 0.0115130957365036 5 4 5.0 2.0 3.5 1.0 1 6 0.702889293398109 -0.00277478560648754 0.40656253289585 -0.073605566375896 6 5 5.9 3.2 4.8 1.8 1 7 1.83587125085685 0.219111861160725 0.640694005278087 -0.0423781588211668 7 6 6.3 3.3 6.0 2.5 2 8 1.34758960781168 0.148247487518865 -0.0201090024812672 0.0620761484510532 8 7 6.5 3.2 5.1 2.0 2 9 2.02242676374659 0.216314242058818 -0.154480091308105 0.0960419998737658 9 8 6.9 3.2 5.7 2.3 2*/