You can use IsolationForest to find anomalies in the data: a prediction of -1 indicates that an anomaly has occurred. IsolationForest looks for noise in the data; you can then use a threshold to remove that data as a subset to clean up the classifications. You can use the elbow method and PCA to find the number of cluster groups, and SVM for classification.
# Demo: flag anomalous rows with IsolationForest (predict() returns -1 for
# outliers, +1 for inliers, according to the fitted model).
data = """ID EyeColor HairColor EducationLevel Income
1 1 1 1 1
2 1 1 2 2
3 2 2 1 1"""
# BUG FIX: the inline data is space-separated, but sep='\t' was passed, which
# parses each row into a single column and breaks the column selections below.
# Split on any run of whitespace instead.
df = pd.read_csv(io.StringIO(data), sep=r'\s+')
print(df.head())

X = df[["EyeColor", "HairColor", "EducationLevel"]]
y = df["Income"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Key hyperparameters: n_estimators, max_samples, max_features.
# (The earlier bare IsolationForest() construction was dead code — it was
# immediately overwritten here — so it has been removed.)
clf = IsolationForest(max_samples=2, n_estimators=10, random_state=10)
clf.fit(X_train)
y_pred_test = clf.predict(X_test)

# NOTE(review): y_test holds Income labels (1/2) while y_pred_test holds
# anomaly flags (-1/+1) — the confusion matrix mixes two label spaces;
# confirm this is intentional before interpreting it.
cm = confusion_matrix(y_test, y_pred_test)
# sns.heatmap(cm)
def plot_detected_anomalies(X, true_labels, predicted_anomalies):
    """Show two side-by-side scatter plots of X[:, 0] vs X[:, 1]:
    the left panel colored by the true labels, the right panel colored
    by the Isolation Forest predictions. Both axes are fixed to [-11, 11].
    """
    panels = [
        (true_labels, 'Clean data and added noise - TRUE'),
        (predicted_anomalies, 'Noise detected via Isolation Forest'),
    ]
    plt.figure(figsize=(12, 6))
    for position, (colors, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.scatter(X[:, 0], X[:, 1], c=colors)
        plt.title(title)
        plt.xlim([-11, 11])
        plt.ylim([-11, 11])
    plt.show()
# Visualize the held-out rows using EyeColor and EducationLevel as the two
# plot axes: true Income labels on the left, -1/+1 anomaly flags on the right.
plot_detected_anomalies(np.array(X_test[["EyeColor","EducationLevel"]]), y_test, y_pred_test)