Machine Learning Basics
by Dr. Jane Smith
Machine Learning Basics
Feature Engineering
Introduction
Feature engineering is the process of selecting, creating, and transforming variables (features) to improve machine learning model performance. It's often said that "garbage in, garbage out" - the quality of your features largely determines the quality of your model.
Why Feature Engineering Matters
- Better Model Performance: Good features lead to better predictions
- Improved Interpretability: Meaningful features make models easier to understand
- Reduced Complexity: Proper features can simplify the model
- Faster Training: Well-engineered features can reduce training time
Types of Feature Engineering
- Feature Creation: Creating new features from existing ones
- Feature Selection: Choosing the most relevant features
- Feature Transformation: Modifying features to improve their properties
- Feature Scaling: Normalizing feature ranges
Feature Creation
Polynomial Features
Creating polynomial features can capture non-linear relationships:
1
2 from sklearn.preprocessing import PolynomialFeatures
3 import numpy as np
4
5 X = np.array([[1, 2], [3, 4], [5, 6]])
6 poly = PolynomialFeatures(degree=2, include_bias=False)
7 X_poly = poly.fit_transform(X)
8
9 # Original: [x1, x2]
10 # Transformed: [x1, x2, x1², x1×x2, x2²]
Interaction Features
Interaction terms capture relationships between variables:
C++ Algorithms
C++ Algorithms Example
This snippet demonstrates common algorithms and data structures in C++.
#include <iostream>
#include <vector>
#include <algorithm>
#include <string>
#include <map>
#include <set>
#include <queue>
#include <memory>
#include <numeric>  // std::accumulate (used in main)
// Template function for binary search over a sorted vector.
// Returns the index of `target`, or -1 if it is not present.
template <typename T>
int binarySearch(const std::vector<T>& arr, const T& target) {
    int left = 0;
    // Cast before subtracting: size() is unsigned, so `arr.size() - 1`
    // would wrap to a huge value when the vector is empty.
    int right = static_cast<int>(arr.size()) - 1;

    while (left <= right) {
        // Midpoint written to avoid overflow of (left + right).
        int mid = left + (right - left) / 2;

        if (arr[mid] == target) {
            return mid;
        } else if (arr[mid] < target) {
            left = mid + 1;
        } else {
            right = mid - 1;
        }
    }

    return -1; // Not found
}
31
// Node of a binary search tree. Children are owned via unique_ptr,
// so destroying a node releases its entire subtree.
struct TreeNode {
    int val;
    std::unique_ptr<TreeNode> left;   // default-constructed to nullptr
    std::unique_ptr<TreeNode> right;  // default-constructed to nullptr

    // explicit: prevent accidental implicit int -> TreeNode conversion.
    explicit TreeNode(int x) : val(x) {}
};
40
41 // Insert into BST
42 std::unique_ptr<TreeNode> insertIntoBST(
43 std::unique_ptr<TreeNode> root,
44 int val
45 ) {
46 if (!root) {
47 return std::make_unique<TreeNode>(val);
48 }
49
50 if (val < root->val) {
51 root->left = insertIntoBST(std::move(root->left), val);
52 } else {
53 root->right = insertIntoBST(std::move(root->right), val);
54 }
55
56 return root;
57 }
58
59 // Inorder traversal
60 void inorderTraversal(const TreeNode* root, std::vector<int>& result) {
61 if (!root) return;
62
63 inorderTraversal(root->left.get(), result);
64 result.push_back(root->val);
65 inorderTraversal(root->right.get(), result);
66 }
67
// Graph stored as an adjacency list keyed by vertex id.
// Supports directed or undirected edges plus BFS/DFS traversals.
class Graph {
private:
    std::map<int, std::vector<int>> adjList;
    bool directed;

    // Neighbors of `node`, or an empty list if the node has no entry.
    // Uses find() instead of operator[] so traversals stay const and do
    // not insert empty adjacency lists as a side effect.
    const std::vector<int>& neighborsOf(int node) const {
        static const std::vector<int> kNoNeighbors;
        auto it = adjList.find(node);
        return it == adjList.end() ? kNoNeighbors : it->second;
    }

public:
    explicit Graph(bool isDirected = false) : directed(isDirected) {}

    // Adds edge u->v; for undirected graphs the mirror edge v->u too.
    void addEdge(int u, int v) {
        adjList[u].push_back(v);
        if (!directed) {
            adjList[v].push_back(u);
        }
    }

    // BFS traversal: vertices reachable from `start` in breadth-first
    // order. Const: the graph is never modified while traversing.
    std::vector<int> bfs(int start) const {
        std::vector<int> result;
        std::queue<int> q;
        std::set<int> visited;

        q.push(start);
        visited.insert(start);

        while (!q.empty()) {
            int current = q.front();
            q.pop();
            result.push_back(current);

            for (int neighbor : neighborsOf(current)) {
                // insert().second is true only for a first visit.
                if (visited.insert(neighbor).second) {
                    q.push(neighbor);
                }
            }
        }

        return result;
    }

    // DFS helper: recursively visits unvisited neighbors of `node`.
    void dfsUtil(int node, std::set<int>& visited, std::vector<int>& result) const {
        visited.insert(node);
        result.push_back(node);

        for (int neighbor : neighborsOf(node)) {
            if (visited.find(neighbor) == visited.end()) {
                dfsUtil(neighbor, visited, result);
            }
        }
    }

    // DFS traversal: vertices reachable from `start` in depth-first order.
    std::vector<int> dfs(int start) const {
        std::vector<int> result;
        std::set<int> visited;
        dfsUtil(start, visited, result);
        return result;
    }
};
128
129 int main() {
130 std::cout << "=== C++ Algorithms Demo ===\n\n";
131
132 // 1. Binary Search
133 std::vector<int> sortedArray = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
134 int target = 7;
135 int index = binarySearch(sortedArray, target);
136 std::cout << "Binary Search: Found " << target << " at index " << index << "\n\n";
137
138 // 2. BST operations
139 std::unique_ptr<TreeNode> bstRoot;
140 std::vector<int> values = {5, 3, 7, 2, 4, 6, 8};
141
142 for (int val : values) {
143 bstRoot = insertIntoBST(std::move(bstRoot), val);
144 }
145
146 std::vector<int> inorderResult;
147 inorderTraversal(bstRoot.get(), inorderResult);
148
149 std::cout << "BST Inorder Traversal: ";
150 for (int val : inorderResult) {
151 std::cout << val << " ";
152 }
153 std::cout << "\n\n";
154
155 // 3. Graph traversal
156 Graph graph(false); // Undirected graph
157 graph.addEdge(0, 1);
158 graph.addEdge(0, 2);
159 graph.addEdge(1, 3);
160 graph.addEdge(2, 4);
161 graph.addEdge(3, 5);
162 graph.addEdge(4, 5);
163
164 std::vector<int> bfsResult = graph.bfs(0);
165 std::cout << "Graph BFS from node 0: ";
166 for (int node : bfsResult) {
167 std::cout << node << " ";
168 }
169 std::cout << "\n";
170
171 std::vector<int> dfsResult = graph.dfs(0);
172 std::cout << "Graph DFS from node 0: ";
173 for (int node : dfsResult) {
174 std::cout << node << " ";
175 }
176 std::cout << "\n";
177
178 // 4. STL algorithms
179 std::vector<int> numbers = {5, 2, 8, 1, 9, 3, 7, 4, 6};
180
181 // Sort
182 std::sort(numbers.begin(), numbers.end());
183 std::cout << "\nSorted numbers: ";
184 for (int num : numbers) {
185 std::cout << num << " ";
186 }
187 std::cout << "\n";
188
189 // Find
190 auto it = std::find(numbers.begin(), numbers.end(), 7);
191 if (it != numbers.end()) {
192 std::cout << "Found 7 in the vector\n";
193 }
194
195 // Count
196 int count = std::count(numbers.begin(), numbers.end(), 5);
197 std::cout << "Count of 5: " << count << "\n";
198
199 // Accumulate
200 int sum = std::accumulate(numbers.begin(), numbers.end(), 0);
201 std::cout << "Sum of all numbers: " << sum << "\n";
202
203 return 0;
204 }
1
2 #include <vector>
3 #include <algorithm>
4
// Augments each sample with pairwise interaction terms.
// Input:  X, an n_samples x n_features matrix (all rows equal length).
// Output: each row is the original features followed by x_j * x_k for
//         every pair j < k.
std::vector<std::vector<double>> create_interaction_features(
    const std::vector<std::vector<double>>& X) {

    // Guard the empty matrix: X[0] below would be undefined behavior.
    if (X.empty()) {
        return {};
    }

    const size_t n_samples = X.size();
    const size_t n_features = X[0].size();
    // n_features originals plus C(n_features, 2) interaction terms.
    const size_t out_width = n_features + n_features * (n_features - 1) / 2;

    std::vector<std::vector<double>> X_interaction(n_samples);

    for (size_t i = 0; i < n_samples; ++i) {
        // Original features first.
        X_interaction[i] = X[i];
        X_interaction[i].reserve(out_width);  // single allocation per row

        // Interaction terms for each unordered feature pair.
        for (size_t j = 0; j < n_features; ++j) {
            for (size_t k = j + 1; k < n_features; ++k) {
                X_interaction[i].push_back(X[i][j] * X[i][k]);
            }
        }
    }

    return X_interaction;
}
Domain-Specific Features
Creating features based on domain knowledge often provides the most value. Example: time series features include:
- Day of week
- Month of year
- Holiday indicator
- Seasonal indicators
- Lag features
- Rolling averages
- Local: Recent changes, short-term trends
- Global: Long-term cycles, seasonal patterns
- Uncertainty: Volatility measures, confidence intervals
Feature Selection
Filter Methods
Filter methods select features based on their statistical properties:
1
2 from sklearn.feature_selection import SelectKBest, f_classif
3 from sklearn.datasets import load_iris
4
5 X, y = load_iris(return_X_y=True)
6
7 # Select top 2 features based on ANOVA F-value
8 selector = SelectKBest(f_classif, k=2)
9 X_selected = selector.fit_transform(X, y)
10
11 print(f"Original features: {X.shape[1]}")
12 print(f"Selected features: {X_selected.shape[1]}")
Wrapper Methods
Wrapper methods use the model itself to evaluate candidate feature subsets (e.g. recursive feature elimination).
Integrals
The integral is the inverse operation of differentiation, representing the accumulation of quantities.
Definition
The definite integral of a function $f(x)$ from $a$ to $b$ is defined as:
$$\int_a^b f(x)\,dx = \lim_{n \to \infty} \sum_{i=1}^{n} f(x_i^*)\,\Delta x$$
This represents the area under the curve $y = f(x)$ from $x = a$ to $x = b$.
Fundamental Theorem of Calculus
The Fundamental Theorem of Calculus connects differentiation and integration:
$$\frac{d}{dx} \int_a^x f(t)\,dt = f(x), \qquad \int_a^b f(x)\,dx = F(b) - F(a) \text{ where } F' = f$$
This theorem shows that differentiation and integration are inverse operations.
Common Integrals
Here are some common integrals:
- Power rule: $\int x^n\,dx = \frac{x^{n+1}}{n+1} + C$ (for $n \neq -1$)
- Exponential: $\int e^x\,dx = e^x + C$
- Trigonometric: $\int \sin x\,dx = -\cos x + C$, $\int \cos x\,dx = \sin x + C$
Applications
Integrals have numerous applications in:
- Calculating areas and volumes
- Physics (work, energy)
- Probability (probability distributions)
- Economics (total cost, total revenue)
Embedded Methods
Embedded methods perform feature selection during model training:
- Lasso (L1) regularization
- Ridge (L2) regularization
- Decision tree feature importance
- Random forest feature importance
Feature Transformation
Normalization
Min-Max Scaling:
1
2 from sklearn.preprocessing import MinMaxScaler
3
4 scaler = MinMaxScaler()
5 X_scaled = scaler.fit_transform(X)
Standardization (Z-score scaling):
1
2 from sklearn.preprocessing import StandardScaler
3
4 scaler = StandardScaler()
5 X_scaled = scaler.fit_transform(X)
Log Transformation
Log transformation helps handle skewed data:
1
2 import numpy as np
3
4 # Apply log transformation
5 X_log = np.log1p(X) # log1p handles zero values
Box-Cox Transformation
Box-Cox is a family of power transformations:
1
2 from sklearn.preprocessing import PowerTransformer
3
4 pt = PowerTransformer(method='box-cox')
5 X_transformed = pt.fit_transform(X)
One-Hot Encoding:
1
2 from sklearn.preprocessing import OneHotEncoder
3
4 encoder = OneHotEncoder(sparse=False)
5 X_encoded = encoder.fit_transform(X_categorical)
Label Encoding
1
2 from sklearn.preprocessing import LabelEncoder
3
4 encoder = LabelEncoder()
5 y_encoded = encoder.fit_transform(y)
Target Encoding
Target encoding uses the target variable to encode categorical features:
1
2 import pandas as pd
3
def target_encoding(df, categorical_col, target_col):
    """Encode a categorical column by the mean of the target per category.

    Adds a new column named ``<categorical_col>_encoded`` to ``df`` in
    place and returns the same DataFrame.
    """
    # Encoding table: mean target value for each category level.
    category_means = df.groupby(categorical_col)[target_col].mean()

    # Look up each row's category in the table to produce the encoding.
    df[f'{categorical_col}_encoded'] = df[categorical_col].map(category_means)

    return df
Handling Missing Values
Deletion Strategy
Remove rows or columns with too many missing values:
1
2 # Remove rows with >50% missing values
3 threshold = len(df.columns) * 0.5
4 df.dropna(thresh=threshold, inplace=True)
5
6 # Remove columns with >50% missing values
7 df.dropna(axis=1, thresh=len(df) * 0.5, inplace=True)
Imputation Strategies
Mean/Median/Mode Imputation:
1
2 from sklearn.impute import SimpleImputer
3
4 # Mean imputation for numerical features
5 mean_imputer = SimpleImputer(strategy='mean')
6 X_imputed = mean_imputer.fit_transform(X_numerical)
7
8 # Mode imputation for categorical features
9 mode_imputer = SimpleImputer(strategy='most_frequent')
10 X_imputed = mode_imputer.fit_transform(X_categorical)
- KNN imputation
- Regression imputation
- Matrix completion
- Deep learning-based imputation
Feature Engineering for Different Data Types
Text Features
- Bag of Words
- TF-IDF
- Word embeddings (Word2Vec, GloVe)
- Contextual embeddings (BERT, GPT)
Image Features
- Raw pixels
- Histogram of Oriented Gradients (HOG)
- Convolutional Neural Network features
- Pre-trained model embeddings
Time Series Features
- Lag features
- Rolling statistics
- Seasonal decomposition
- Fourier transforms
Best Practices
- Understand Your Data: Explore data distributions and relationships
- Start Simple: Begin with basic features before complex ones
- Validate Properly: Use appropriate validation techniques
- Document Everything: Keep track of feature engineering steps
- Iterate: Continuously refine and improve features
Common Pitfalls
- Data Leakage: Using future information in feature creation
- Overfitting: Creating too many features
- Curse of Dimensionality: Too many features relative to samples
- Ignoring Feature Importance: Not understanding which features matter
Tools and Libraries
Python:
- scikit-learn
- pandas
- numpy
- featuretools
- tsfresh (for time series)
R:
- caret
- recipes
- tidyverse