Shape Your Data Smarter: Dimensionality Reduction in Python and R.



Python, with its rich ecosystem of libraries, and the R language together provide powerful tools for solving dimensionality reduction tasks.


Updated: 2024-07-26 by Andrey BRATUS, Senior Data Analyst.





    The three main methods for dimensionality reduction are Principal Component Analysis (PCA), Linear Discriminant Analysis (LDA), and Kernel PCA. Building these three models in both Python and R is shown below; all of the Python recipes follow the same scale-reduce-classify pattern, sketched right after this paragraph.
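
    A minimal sketch of that shared pattern, wrapped in a scikit-learn Pipeline so the scaler and the reducer never see the test fold. The synthetic data from make_classification is only a stand-in for the placeholder my_dataset.csv:

    # A minimal sketch of the shared scale -> reduce -> classify pattern.
    # Wrapping the steps in a Pipeline keeps the scaler and the reducer
    # from ever being fit on the test fold (no data leakage).
    # make_classification provides synthetic stand-in data, not the real dataset.
    from sklearn.datasets import make_classification
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    X, y = make_classification(n_samples=200, n_features=13, n_informative=5, random_state=0)
    pipe = make_pipeline(StandardScaler(), PCA(n_components=2), LogisticRegression(random_state=0))
    print(cross_val_score(pipe, X, y, cv=5).mean())  # mean accuracy across 5 folds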


  1. Principal Component Analysis (PCA) in Python - Classification case.


    #Importing the libraries
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    
    #Importing the dataset
    dataset = pd.read_csv('my_dataset.csv')
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    
    #Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    
    #Feature Scaling
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    #Applying PCA
    from sklearn.decomposition import PCA
    pca = PCA(n_components = 2)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    
    #Training the Logistic Regression model on the Training set
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state = 0)
    classifier.fit(X_train, y_train)
    
    #Making the Confusion Matrix
    from sklearn.metrics import confusion_matrix, accuracy_score
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(accuracy_score(y_test, y_pred))  # overall test accuracy
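
    The choice of n_components = 2 above is mainly for easy 2D plotting. In practice you would first check how much variance the leading components actually retain; a minimal sketch, run on the scaled X_train before the PCA step above replaces it:

    # Cumulative share of variance kept by the first k components.
    # Run on the scaled X_train *before* PCA overwrites it above.
    import numpy as np
    from sklearn.decomposition import PCA

    pca_full = PCA().fit(X_train)  # keep all components
    print(np.cumsum(pca_full.explained_variance_ratio_))  # e.g. choose k where this exceeds 0.95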
    



  2. Principal Component Analysis (PCA) in R - Classification case.


    # Importing the dataset
    dataset = read.csv('my_dataset.csv')
    
    # Splitting the dataset into the Training set and Test set
    # install.packages('caTools')
    library(caTools)
    set.seed(123)
    split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
    training_set = subset(dataset, split == TRUE)
    test_set = subset(dataset, split == FALSE)
    
    # Feature Scaling (column 14 holds the class label Customer_Segment)
    training_set[-14] = scale(training_set[-14])
    test_set[-14] = scale(test_set[-14])
    
    # Applying PCA
    # install.packages('caret')
    library(caret)
    # install.packages('e1071')
    library(e1071)
    pca = preProcess(x = training_set[-14], method = 'pca', pcaComp = 2)
    training_set = predict(pca, training_set)
    training_set = training_set[c(2, 3, 1)]  # reorder: PC1, PC2, then Customer_Segment
    test_set = predict(pca, test_set)
    test_set = test_set[c(2, 3, 1)]          # same reordering for the test set
    
    # Fitting SVM to the Training set
    # install.packages('e1071')
    library(e1071)
    classifier = svm(formula = Customer_Segment ~ .,
                     data = training_set,
                     type = 'C-classification',
                     kernel = 'linear')
    
    # Predicting the Test set results
    y_pred = predict(classifier, newdata = test_set[-3])
    
    # Making the Confusion Matrix
    cm = table(test_set[, 3], y_pred)
    



  3. Linear Discriminant Analysis (LDA) in Python - Classification case.


    #Importing the libraries
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    
    #Importing the dataset
    dataset = pd.read_csv('my_dataset.csv')
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    
    #Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    
    #Feature Scaling
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    #Applying LDA
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    lda = LDA(n_components = 2)
    X_train = lda.fit_transform(X_train, y_train)
    X_test = lda.transform(X_test)
    
    #Training the Logistic Regression model on the Training set
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state = 0)
    classifier.fit(X_train, y_train)
    
    #Making the Confusion Matrix
    from sklearn.metrics import confusion_matrix, accuracy_score
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(accuracy_score(y_test, y_pred))  # overall test accuracy
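
    Unlike PCA, LDA is supervised (note that fit_transform receives y_train as well) and can return at most n_classes - 1 components, so n_components = 2 assumes at least three classes. A quick sanity check:

    # LDA yields at most (number of classes - 1) discriminants.
    import numpy as np

    max_components = len(np.unique(y_train)) - 1
    print(max_components)  # must be >= the n_components requested above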
    

  4. Linear Discriminant Analysis (LDA) in R - Classification case.


    # Importing the dataset
    dataset = read.csv('my_dataset.csv')
    
    # Splitting the dataset into the Training set and Test set
    # install.packages('caTools')
    library(caTools)
    set.seed(123)
    split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
    training_set = subset(dataset, split == TRUE)
    test_set = subset(dataset, split == FALSE)
    
    # Feature Scaling (column 14 holds the class label Customer_Segment)
    training_set[-14] = scale(training_set[-14])
    test_set[-14] = scale(test_set[-14])
    
    # Applying LDA
    library(MASS)
    lda = lda(formula = Customer_Segment ~ ., data = training_set)
    training_set = as.data.frame(predict(lda, training_set))
    training_set = training_set[c(5, 6, 1)]  # keep LD1, LD2 (cols 5-6) and the class (col 1)
    test_set = as.data.frame(predict(lda, test_set))
    test_set = test_set[c(5, 6, 1)]          # same selection for the test set
    
    # Fitting SVM to the Training set
    # install.packages('e1071')
    library(e1071)
    # predict() on an lda fit names the label column 'class'
    classifier = svm(formula = class ~ .,
                     data = training_set,
                     type = 'C-classification',
                     kernel = 'linear')
    
    # Predicting the Test set results
    y_pred = predict(classifier, newdata = test_set[-3])
    
    # Making the Confusion Matrix
    cm = table(test_set[, 3], y_pred)
    

  5. Kernel PCA in Python - Classification case.


    #Importing the libraries
    import numpy as np
    import matplotlib.pyplot as plt
    import pandas as pd
    
    #Importing the dataset
    dataset = pd.read_csv('my_dataset.csv')
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    
    #Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
    
    #Feature Scaling
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    #Applying Kernel PCA
    from sklearn.decomposition import KernelPCA
    kpca = KernelPCA(n_components = 2, kernel = 'rbf')
    X_train = kpca.fit_transform(X_train)
    X_test = kpca.transform(X_test)
    
    #Training the Logistic Regression model on the Training set
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(random_state = 0)
    classifier.fit(X_train, y_train)
    
    #Making the Confusion Matrix
    from sklearn.metrics import confusion_matrix, accuracy_score
    y_pred = classifier.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    print(accuracy_score(y_test, y_pred))  # overall test accuracy
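
    The RBF kernel has a width parameter gamma, which scikit-learn defaults to 1 / n_features; the resulting embedding can be quite sensitive to it. A minimal sketch trying a few assumed illustrative values on the scaled X_train, before the Kernel PCA step above replaces it:

    # The embedding produced by KernelPCA shifts with the RBF width gamma,
    # so treat gamma as a hyperparameter rather than a fixed constant.
    from sklearn.decomposition import KernelPCA

    for gamma in (0.01, 0.1, 1.0):  # assumed illustrative values
        Z = KernelPCA(n_components=2, kernel='rbf', gamma=gamma).fit_transform(X_train)
        print(gamma, Z[:2])  # eyeball how the projection changes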
    

  6. Kernel PCA in R - Classification case.


    # Importing the dataset
    dataset = read.csv('my_dataset.csv')
    dataset = dataset[, 3:5]  # keep the two numeric features and the target Purchased
    
    # Splitting the dataset into the Training set and Test set
    # install.packages('caTools')
    library(caTools)
    set.seed(123)
    split = sample.split(dataset$Purchased, SplitRatio = 0.8)
    training_set = subset(dataset, split == TRUE)
    test_set = subset(dataset, split == FALSE)
    
    # Feature Scaling
    training_set[, 1:2] = scale(training_set[, 1:2])
    test_set[, 1:2] = scale(test_set[, 1:2])
    
    # Applying Kernel PCA
    # install.packages('kernlab')
    library(kernlab)
    kpca = kpca(~., data = training_set[-3], kernel = 'rbfdot', features = 2)
    training_set_pca = as.data.frame(predict(kpca, training_set))  # project onto the 2 extracted components
    training_set_pca$Purchased = training_set$Purchased
    test_set_pca = as.data.frame(predict(kpca, test_set))
    test_set_pca$Purchased = test_set$Purchased
    
    # Fitting Logistic Regression to the Training set
    classifier = glm(formula = Purchased ~ .,
                     family = binomial,
                     data = training_set_pca)
    
    # Predicting the Test set results
    prob_pred = predict(classifier, type = 'response', newdata = test_set_pca[-3])
    y_pred = ifelse(prob_pred > 0.5, 1, 0)
    
    # Making the Confusion Matrix
    cm = table(test_set_pca[, 3], y_pred)
    



