What is Scikit-learn?
If you’re doing machine learning in Python, scikit-learn is considered the gold standard.
Scikit-learn is an open source Python library that provides a wide selection of supervised and unsupervised learning algorithms.
It implements a range of machine learning tools performing preprocessing, cross-validation and visualization using a unified interface.
Python Knowledge Base: Make coding great again.
- Updated:
2025-01-02 by Andrey BRATUS, Senior Data Analyst.
Initial Data load.
Training And Test Data.
Preprocessing The Data - Normalization.
Preprocessing The Data - Binarization.
Preprocessing The Data - Encoding Categorical Features.
Preprocessing The Data - Imputing Missing Values.
Preprocessing The Data - Generating Polynomial Features.
Creating a Model - Supervised Learning.
Model Fitting.
Prediction.
Evaluating Model’s Performance - Classification Metrics.
Evaluating Model’s Performance - Regression Metrics.
Evaluating Model’s Performance - Clustering Metrics.
Evaluating Model’s Performance - Cross-Validation.
Model Tuning - Grid Search.
Model Tuning - Randomized Parameter Optimization.
Scikit-learn's rich set of algorithms includes Regression, Clustering, Decision Trees, Neural Networks, SVMs and Naive Bayes. Corresponding use cases are presented in other sections of this site.
Initial input data should be numeric and stored as NumPy arrays, pandas DataFrames or SciPy sparse matrices.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw table from the spreadsheet. The frame is called `data`
# (not `set`) so the built-in `set` type stays usable in the rest of the script.
data = pd.read_excel('file.xlsx')

# Feature matrix: list every input column exactly once. The names below are
# placeholders; repeating the same name would just duplicate that column.
X = data[[
    'feature1',
    'feature2',
    'feature3',  # ...
]]

# Target column to predict.
y = data['target']

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=103
)
Preprocessing The Data - Standardization.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
standardized_X, standardized_X_test = (
    scaler.transform(split) for split in (X_train, X_test)
)
from sklearn.preprocessing import Normalizer

# Rebinds `scaler` to a Normalizer fitted on the training features, then
# transforms the training and test features with it.
scaler = Normalizer()
scaler.fit(X_train)
normalized_X, normalized_X_test = (
    scaler.transform(split) for split in (X_train, X_test)
)
from sklearn.preprocessing import Binarizer

# Threshold the full feature matrix at 0.0: values above the threshold map
# to 1, everything else to 0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binary_X = binarizer.transform(X)
from sklearn.preprocessing import LabelEncoder

# Replace the target labels in `y` with integer codes; `enc` keeps the
# mapping so codes can be translated back later.
# NOTE(review): only `y` is rebound here; `y_train` / `y_test` produced by the
# earlier split are not re-encoded - confirm that is intended.
enc = LabelEncoder()
enc.fit(y)
y = enc.transform(y)
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; its
# replacement is `sklearn.impute.SimpleImputer`, which always imputes
# column-wise and therefore has no `axis` argument.
from sklearn.impute import SimpleImputer

# Treat zeros as missing values and replace them with the column mean.
imp = SimpleImputer(missing_values=0, strategy='mean')
# The imputed array is not stored (notebook-style display only).
imp.fit_transform(X_train)
from sklearn.preprocessing import PolynomialFeatures

# Expand the features into polynomial terms up to degree 3.
# The expanded matrix is not stored (notebook-style display only).
poly = PolynomialFeatures(degree=3)
poly.fit_transform(X)
from sklearn import neighbors
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Linear Regression
# The `normalize` constructor argument no longer exists in current
# scikit-learn releases, so it is not passed. If scaled inputs are wanted,
# scale them explicitly (e.g. with StandardScaler) before fitting.
lr = LinearRegression()

# Support Vector Machines (SVM) with a linear kernel
svc = SVC(kernel='linear')

# Naive Bayes
gnb = GaussianNB()

# KNN: classify by majority vote among the 3 nearest neighbours
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Supervised learning: estimators are fitted on features plus target.
# NOTE(review): `lr` is fitted on the full X / y while the classifiers use
# only the training split - confirm this difference is intended.
lr.fit(X, y)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

# Unsupervised learning: estimators are fitted on features only.
# `k_means` and `pca` were never created before being used here, so they are
# instantiated in this block. The hyper-parameters are example values.
k_means = KMeans(n_clusters=3, random_state=0)
pca = PCA(n_components=0.95)
k_means.fit(X_train)
pca_model = pca.fit_transform(X_train)
# Each snippet below rebinds `y_pred`, so after this block it holds the
# result of the last call (the k-means cluster labels).

# Supervised estimators
# Predict two random samples. The sample width is taken from X_test instead
# of a hard-coded 5 so it always matches the number of features the model
# was trained on.
y_pred = svc.predict(np.random.random((2, X_test.shape[1])))
y_pred = lr.predict(X_test)
# predict_proba returns per-class probabilities, not class labels.
y_pred = knn.predict_proba(X_test)

# Unsupervised estimators
y_pred = k_means.predict(X_test)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Accuracy, two ways: the estimator's own `score` method and the metrics
# function applied to existing predictions. Neither result is stored.
knn.score(X_test, y_test)
accuracy_score(y_test, y_pred)

# Text report from classification_report for the current predictions
print(classification_report(y_test, y_pred))

# Confusion matrix of true labels against predicted labels
print(confusion_matrix(y_test, y_pred))
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Ground truth for the metrics below. `y_pred` was predicted from X_test, so
# the matching truth is y_test; a hard-coded 3-element list would only work
# if the test split happened to contain exactly three rows.
y_true = y_test

# Mean Absolute Error
mean_absolute_error(y_true, y_pred)

# Mean Squared Error
mean_squared_error(y_test, y_pred)

# R² Score
r2_score(y_true, y_pred)
from sklearn.metrics import (
    adjusted_rand_score,
    homogeneity_score,
    v_measure_score,
)

# Each score compares the reference labels in y_true with the labels in
# y_pred. Results are not stored.

# Adjusted Rand Index
adjusted_rand_score(y_true, y_pred)

# Homogeneity
homogeneity_score(y_true, y_pred)

# V-measure
# Call the imported function directly; the `metrics` module itself is never
# imported, so `metrics.v_measure_score` would raise NameError.
v_measure_score(y_true, y_pred)
# `sklearn.cross_validation` no longer exists; `cross_val_score` lives in
# `sklearn.model_selection`, the module this script already uses for
# `train_test_split`.
from sklearn.model_selection import cross_val_score

# 4-fold cross-validation of the KNN classifier on the training split
print(cross_val_score(knn, X_train, y_train, cv=4))
# 2-fold cross-validation of the linear regression on the full data
print(cross_val_score(lr, X, y, cv=2))
# `sklearn.grid_search` no longer exists; `GridSearchCV` lives in
# `sklearn.model_selection`.
from sklearn.model_selection import GridSearchCV

# Exhaustively try every combination of neighbour count (1 or 2) and
# distance metric for the KNN classifier.
params = {
    "n_neighbors": np.arange(1, 3),
    "metric": ["euclidean", "cityblock"],
}
grid = GridSearchCV(estimator=knn, param_grid=params)
grid.fit(X_train, y_train)

# Best cross-validated score and the neighbour count that achieved it
print(grid.best_score_)
print(grid.best_estimator_.n_neighbors)
# `sklearn.grid_search` no longer exists; `RandomizedSearchCV` lives in
# `sklearn.model_selection`.
from sklearn.model_selection import RandomizedSearchCV

# Candidate settings: 4 neighbour counts x 2 weighting schemes = 8
# combinations, so n_iter=8 is enough to cover the whole space.
params = {
    "n_neighbors": range(1, 5),
    "weights": ["uniform", "distance"],
}
rsearch = RandomizedSearchCV(
    estimator=knn,
    param_distributions=params,
    cv=4,
    n_iter=8,
    random_state=5,  # fixed seed so the sampled candidates are reproducible
)
rsearch.fit(X_train, y_train)

# Best cross-validated score found by the search
print(rsearch.best_score_)