Regression
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from xgboost import XGBRegressor

TARGET_COL = 'yield'
ID_COL = 'id'  # set to None if no ID column

# Columns to drop entirely (redundant/leaky)
REDUNDANT_COLS = [
    'MinOfUpperTRange', 'MaxOfUpperTRange', 'MinOfLowerTRange',
    'MaxOfLowerTRange', 'AverageOfLowerTRange', 'AverageRainingDays'
]

# Ordinal columns — define the order of categories
# Example: {'size': ['small', 'medium', 'large']}
ORDINAL_COLS = {}

# Columns with more unique values than this get Label Encoded, else One-Hot
OHE_MAX_CARDINALITY = 15

# ──────────────────────────────────────────────
# 1. LOAD DATA
# ──────────────────────────────────────────────
print("1. Loading data...")
Data = pd.read_csv("data/train.csv")
TData = pd.read_csv("data/test.csv")

# ──────────────────────────────────────────────
# 2. HELPER FUNCTIONS
# ──────────────────────────────────────────────
def drop_columns(df, columns):
    return df.drop(columns=columns, errors='ignore')


def fix_datetime_cols(df):
    """Auto-detect datetime strings and extract useful numeric features."""
    for col in df.select_dtypes(include=['object', 'string']).columns:
        try:
            # errors='coerce' turns unparseable values into NaT, so the 80%
            # threshold below works even when a column is partially dirty
            parsed = pd.to_datetime(df[col], errors='coerce')
            if parsed.notna().mean() > 0.8:  # treat as datetime if 80%+ parse OK
                df[col + '_year'] = parsed.dt.year
                df[col + '_month'] = parsed.dt.month
                df[col + '_day'] = parsed.dt.day
                df[col + '_weekday'] = parsed.dt.weekday
                df[col + '_hour'] = parsed.dt.hour
                df = df.drop(columns=[col])
        except Exception:
            pass
    return df


def fill_nulls(df, num_medians=None, cat_modes=None, fit=False):
    """
    Fill missing values:
      - Numeric → median (fit on train, transform on test)
      - Object  → mode
    Returns (df, num_medians, cat_modes) when fit=True, else df.
    """
    num_cols = df.select_dtypes(include='number').columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if fit:
        num_medians = df[num_cols].median()
        cat_modes = {c: df[c].mode()[0] if not df[c].mode().empty else 'Unknown'
                     for c in cat_cols}
    df[num_cols] = df[num_cols].fillna(num_medians)
    for col in cat_cols:
        df[col] = df[col].fillna(cat_modes.get(col, 'Unknown'))
    if fit:
        return df, num_medians, cat_modes
    return df


def encode_features(df_train, df_test, ordinal_cols, ohe_max_cardinality):
    """
    Smart encoding — applied consistently across train & test:
      • Ordinal columns         → OrdinalEncoder (order-preserving)
      • Low-cardinality object  → One-Hot Encoding
      • High-cardinality object → Label Encoding
    """
    obj_cols = df_train.select_dtypes(include=['object', 'string']).columns.tolist()
    ord_keys = [c for c in ordinal_cols if c in df_train.columns]
    ohe_cols = [c for c in obj_cols
                if c not in ord_keys and df_train[c].nunique() <= ohe_max_cardinality]
    label_cols = [c for c in obj_cols
                  if c not in ord_keys and df_train[c].nunique() > ohe_max_cardinality]

    print(f"   Ordinal encoded : {ord_keys}")
    print(f"   One-Hot encoded : {ohe_cols}")
    print(f"   Label encoded   : {label_cols}")

    # --- Ordinal Encoding ---
    if ord_keys:
        categories = [ordinal_cols[c] for c in ord_keys]
        oe = OrdinalEncoder(categories=categories,
                            handle_unknown='use_encoded_value', unknown_value=-1)
        # cast both frames to str so train and test are encoded identically
        df_train[ord_keys] = oe.fit_transform(df_train[ord_keys].astype(str))
        df_test[ord_keys] = oe.transform(df_test[ord_keys].astype(str))

    # --- Label Encoding (high cardinality) ---
    for col in label_cols:
        le = LabelEncoder()
        combined = pd.concat(
            [df_train[col],
             df_test[col] if col in df_test.columns else pd.Series(dtype=str)],
            axis=0).astype(str)
        le.fit(combined)
        df_train[col] = le.transform(df_train[col].astype(str))
        if col in df_test.columns:
            df_test[col] = df_test[col].astype(str).apply(
                lambda x: le.transform([x])[0] if x in le.classes_ else -1
            )

    # --- One-Hot Encoding (low cardinality) ---
    df_train = pd.get_dummies(df_train, columns=ohe_cols)
    df_test = pd.get_dummies(df_test,
                             columns=[c for c in ohe_cols if c in df_test.columns])

    # Align columns — test may be missing some OHE columns
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
    return df_train, df_test


def create_features(df):
    """Add engineered features here."""
    # Example: df['TRange'] = df['MaxOfUpperTRange'] - df['MinOfLowerTRange']
    return df

# ──────────────────────────────────────────────
# 3. PREPROCESS TRAINING DATA
# ──────────────────────────────────────────────
print("2. Preprocessing training data...")
# same ID handling as classification.py, so ID_COL = None also works here
test_ids = TData[ID_COL] if ID_COL and ID_COL in TData.columns else None
df = drop_columns(Data, [ID_COL] if ID_COL else [])
df_test = drop_columns(TData, [ID_COL] if ID_COL else [])

# Datetime
df = fix_datetime_cols(df)
df_test = fix_datetime_cols(df_test)

# Missing values — fit on train, apply to test
df, num_medians, cat_modes = fill_nulls(df, fit=True)
df_test = fill_nulls(df_test, num_medians=num_medians, cat_modes=cat_modes)

# Feature engineering
df = create_features(df)
df_test = create_features(df_test)

# Drop redundant columns
df = drop_columns(df, REDUNDANT_COLS)
df_test = drop_columns(df_test, REDUNDANT_COLS)

# Encoding — train and test encoded together for consistency
df_features = df.drop(columns=[TARGET_COL])
df_target = df[TARGET_COL]
df_features, df_test = encode_features(
    df_features, df_test, ORDINAL_COLS, OHE_MAX_CARDINALITY
)
processed_data = pd.concat([df_features, df_target], axis=1)

# ──────────────────────────────────────────────
# 4. SPLIT & SCALE
# ──────────────────────────────────────────────
print("3. Splitting and scaling...")
X = processed_data.drop(columns=[TARGET_COL])
y = processed_data[TARGET_COL]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# explicit copies avoid pandas SettingWithCopyWarning when scaling in place
X_train, X_val = X_train.copy(), X_val.copy()

scaler = StandardScaler()
num_cols = X_train.select_dtypes(include='number').columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])
train_cols = X_train.columns

# ──────────────────────────────────────────────
# 5. MODEL + GRIDSEARCHCV
# ──────────────────────────────────────────────
print("4. Training with GridSearchCV...")
model_base = XGBRegressor(random_state=42, n_jobs=-1, eval_metric='mae')
param_grid = {
    'n_estimators'    : [300, 600],
    'max_depth'       : [4, 6],
    'learning_rate'   : [0.05, 0.1],
    'subsample'       : [0.8],
    'colsample_bytree': [0.8],
}
grid_search = GridSearchCV(
    estimator  = model_base,
    param_grid = param_grid,
    cv         = 3,
    scoring    = 'neg_mean_absolute_error',
    n_jobs     = -1,
    verbose    = 2,
)
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_
print(f"   Best params: {grid_search.best_params_}")

# ──────────────────────────────────────────────
# 6. EVALUATE
# ──────────────────────────────────────────────
y_pred_val = model.predict(X_val)
print(f"Validation MAE: {mean_absolute_error(y_val, y_pred_val):.4f}")

# ──────────────────────────────────────────────
# 7. PREDICT TEST DATA & SAVE
# ──────────────────────────────────────────────
print("5. Predicting test data...")
df_test = df_test.reindex(columns=train_cols, fill_value=0)
predictions = model.predict(df_test)
if test_ids is not None:
    submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: predictions})
else:
    submission = pd.DataFrame({TARGET_COL: predictions})
submission.to_csv('submission.csv', index=False)
print("Done! submission.csv written.")

Classification
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from xgboost import XGBClassifier

TARGET_COL = 'target'  # column to predict (class label)
ID_COL = 'id'          # set to None if no ID column

# Columns to drop entirely (redundant/leaky)
REDUNDANT_COLS = []

# Ordinal columns — define the order of categories
# Example: {'size': ['small', 'medium', 'large']}
ORDINAL_COLS = {}

# Columns with more unique values than this get Label Encoded, else One-Hot
OHE_MAX_CARDINALITY = 15

# ──────────────────────────────────────────────
# 1. LOAD DATA
# ──────────────────────────────────────────────
print("1. Loading data...")
Data = pd.read_csv("data/train.csv")
TData = pd.read_csv("data/test.csv")
print(f"   Train: {Data.shape} | Test: {TData.shape}")

# ──────────────────────────────────────────────
# 2. HELPER FUNCTIONS
# ──────────────────────────────────────────────
def drop_columns(df, columns):
    return df.drop(columns=columns, errors='ignore')


def fix_datetime_cols(df):
    """Auto-detect datetime strings and extract useful numeric features."""
    for col in df.select_dtypes(include=['object', 'string']).columns:
        try:
            # errors='coerce' turns unparseable values into NaT, so the 80%
            # threshold below works even when a column is partially dirty
            parsed = pd.to_datetime(df[col], errors='coerce')
            if parsed.notna().mean() > 0.8:
                df[col + '_year'] = parsed.dt.year
                df[col + '_month'] = parsed.dt.month
                df[col + '_day'] = parsed.dt.day
                df[col + '_weekday'] = parsed.dt.weekday
                df[col + '_hour'] = parsed.dt.hour
                df = df.drop(columns=[col])
        except Exception:
            pass
    return df


def fill_nulls(df, num_medians=None, cat_modes=None, fit=False):
    """
    Fill missing values:
      - Numeric → median (fit on train, apply to test)
      - Object  → mode
    """
    num_cols = df.select_dtypes(include='number').columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if fit:
        num_medians = df[num_cols].median()
        cat_modes = {c: df[c].mode()[0] if not df[c].mode().empty else 'Unknown'
                     for c in cat_cols}
    df[num_cols] = df[num_cols].fillna(num_medians)
    for col in cat_cols:
        df[col] = df[col].fillna(cat_modes.get(col, 'Unknown'))
    if fit:
        return df, num_medians, cat_modes
    return df


def encode_features(df_train, df_test, ordinal_cols, ohe_max_cardinality):
    """
    Smart encoding — applied consistently across train & test:
      • Ordinal columns         → OrdinalEncoder (order-preserving)
      • Low-cardinality object  → One-Hot Encoding
      • High-cardinality object → Label Encoding
    """
    obj_cols = df_train.select_dtypes(include=['object', 'string']).columns.tolist()
    ord_keys = [c for c in ordinal_cols if c in df_train.columns]
    ohe_cols = [c for c in obj_cols
                if c not in ord_keys and df_train[c].nunique() <= ohe_max_cardinality]
    label_cols = [c for c in obj_cols
                  if c not in ord_keys and df_train[c].nunique() > ohe_max_cardinality]

    print(f"   Ordinal encoded : {ord_keys}")
    print(f"   One-Hot encoded : {ohe_cols}")
    print(f"   Label encoded   : {label_cols}")

    # --- Ordinal Encoding ---
    if ord_keys:
        categories = [ordinal_cols[c] for c in ord_keys]
        oe = OrdinalEncoder(categories=categories,
                            handle_unknown='use_encoded_value', unknown_value=-1)
        # cast both frames to str so train and test are encoded identically
        df_train[ord_keys] = oe.fit_transform(df_train[ord_keys].astype(str))
        df_test[ord_keys] = oe.transform(df_test[ord_keys].astype(str))

    # --- Label Encoding (high cardinality) ---
    for col in label_cols:
        le = LabelEncoder()
        combined = pd.concat(
            [df_train[col],
             df_test[col] if col in df_test.columns else pd.Series(dtype=str)],
            axis=0).astype(str)
        le.fit(combined)
        df_train[col] = le.transform(df_train[col].astype(str))
        if col in df_test.columns:
            df_test[col] = df_test[col].astype(str).apply(
                lambda x: le.transform([x])[0] if x in le.classes_ else -1
            )

    # --- One-Hot Encoding (low cardinality) ---
    df_train = pd.get_dummies(df_train, columns=ohe_cols)
    df_test = pd.get_dummies(df_test,
                             columns=[c for c in ohe_cols if c in df_test.columns])

    # Align columns — test may be missing some OHE columns
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
    return df_train, df_test


def create_features(df):
    """Add engineered features here."""
    return df

# ──────────────────────────────────────────────
# 3. PREPROCESS
# ──────────────────────────────────────────────
print("2. Preprocessing data...")
test_ids = TData[ID_COL] if ID_COL and ID_COL in TData.columns else None
df = drop_columns(Data, [ID_COL] if ID_COL else [])
df_test = drop_columns(TData, [ID_COL] if ID_COL else [])

# Datetime
df = fix_datetime_cols(df)
df_test = fix_datetime_cols(df_test)

# Missing values — fit on train, apply to test
df, num_medians, cat_modes = fill_nulls(df, fit=True)
df_test = fill_nulls(df_test, num_medians=num_medians, cat_modes=cat_modes)

# Feature engineering
df = create_features(df)
df_test = create_features(df_test)

# Drop redundant columns
df = drop_columns(df, REDUNDANT_COLS)
df_test = drop_columns(df_test, REDUNDANT_COLS)

# Encode target column — XGBoost needs numeric labels
target_le = LabelEncoder()
df[TARGET_COL] = target_le.fit_transform(df[TARGET_COL].astype(str))
num_classes = df[TARGET_COL].nunique()
print(f"   Classes: {list(target_le.classes_)} ({num_classes} total)")

# Encode features
df_features = df.drop(columns=[TARGET_COL])
df_target = df[TARGET_COL]
df_features, df_test = encode_features(
    df_features, df_test, ORDINAL_COLS, OHE_MAX_CARDINALITY
)
processed_data = pd.concat([df_features, df_target], axis=1)

# ──────────────────────────────────────────────
# 4. SPLIT & SCALE
# ──────────────────────────────────────────────
print("3. Splitting and scaling...")
X = processed_data.drop(columns=[TARGET_COL])
y = processed_data[TARGET_COL]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps class balance
)
# explicit copies avoid pandas SettingWithCopyWarning when scaling in place
X_train, X_val = X_train.copy(), X_val.copy()

scaler = StandardScaler()
num_cols = X_train.select_dtypes(include='number').columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])
train_cols = X_train.columns

# ──────────────────────────────────────────────
# 5. MODEL + GRIDSEARCHCV
# ──────────────────────────────────────────────
print("4. Training with GridSearchCV...")
# use_label_encoder was removed in recent XGBoost versions, and the sklearn
# wrapper infers num_class from y, so neither needs to be passed any more.
# mlogloss is only valid for multi-class; binary tasks need logloss.
model_base = XGBClassifier(
    random_state=42, n_jobs=-1,
    eval_metric='mlogloss' if num_classes > 2 else 'logloss',
    objective='multi:softmax' if num_classes > 2 else 'binary:logistic',
)
param_grid = {
    'n_estimators'    : [300, 600],
    'max_depth'       : [4, 6],
    'learning_rate'   : [0.05, 0.1],
    'subsample'       : [0.8],
    'colsample_bytree': [0.8],
}
grid_search = GridSearchCV(
    estimator  = model_base,
    param_grid = param_grid,
    cv         = 3,
    scoring    = 'accuracy',
    n_jobs     = -1,
    verbose    = 2,
)
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_
print(f"   Best params: {grid_search.best_params_}")

# ──────────────────────────────────────────────
# 6. EVALUATE
# ──────────────────────────────────────────────
y_pred_val = model.predict(X_val)
print(f"\nValidation Accuracy: {accuracy_score(y_val, y_pred_val):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_val, target_names=target_le.classes_))

# ──────────────────────────────────────────────
# 7. PREDICT TEST DATA & SAVE
# ──────────────────────────────────────────────
print("5. Predicting test data...")
df_test = df_test.reindex(columns=train_cols, fill_value=0)
pred_encoded = model.predict(df_test)
predictions = target_le.inverse_transform(pred_encoded)  # decode back to original labels
if test_ids is not None:
    submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: predictions})
else:
    submission = pd.DataFrame({TARGET_COL: predictions})
submission.to_csv('submission.csv', index=False)
print("Done! submission.csv written.")

ML PIPELINE — HOW TO USE (Quick Guide)

Run all scripts from the PROJECT ROOT:
cd /path/to/MLHack
python src/<filename>.py
════════════════════════════════════════
1. regression.py (predict a number)
════════════════════════════════════════
WHEN TO USE: target column is continuous (e.g. price, yield, score)
EDIT THESE at the top of the file:
TARGET_COL → name of the column you want to predict
ID_COL → name of the ID column (or None)
REDUNDANT_COLS → list of columns to drop (leaky/useless)
ORDINAL_COLS → dict of ordered categorical cols
e.g. {'size': ['small','medium','large']}
METRIC reported: MAE (lower = better)
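MAE is the average absolute gap between predictions and ground truth; a two-line sanity check with sklearn's own function:

from sklearn.metrics import mean_absolute_error
# |3.0 - 2.5| = 0.5, |5.0 - 6.0| = 1.0, so MAE = (0.5 + 1.0) / 2 = 0.75
print(mean_absolute_error([3.0, 5.0], [2.5, 6.0]))  # 0.75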
TO IMPROVE SCORE:
• Add domain features inside create_features(df) (see the sketch after this list)
• Increase param_grid values (more compute)
• Lower OHE_MAX_CARDINALITY if too many dummies
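As a sketch of what create_features(df) could hold, here is a hypothetical version for the blueberry-yield data. The TRange line comes from the comment already in regression.py; the bee columns (honeybee, bumbles) are assumptions about the dataset, not guaranteed to exist:

def create_features(df):
    """Hypothetical engineered features — adapt column names to your data."""
    # temperature spread (runs before REDUNDANT_COLS are dropped)
    if {'MaxOfUpperTRange', 'MinOfLowerTRange'}.issubset(df.columns):
        df['TRange'] = df['MaxOfUpperTRange'] - df['MinOfLowerTRange']
    # ratio of two pollinator densities — purely illustrative
    if {'honeybee', 'bumbles'}.issubset(df.columns):
        df['bee_ratio'] = df['honeybee'] / (df['bumbles'] + 1e-6)
    return df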
════════════════════════════════════════
2. classification.py (predict a class)
════════════════════════════════════════
WHEN TO USE: target is a label (e.g. yes/no, cat/dog, spam/ham)
EDIT THESE at the top of the file:
TARGET_COL → class label column name
ID_COL → ID column (or None)
REDUNDANT_COLS → columns to drop
ORDINAL_COLS → ordered categorical columns
METRICS reported: Accuracy + Classification Report
(precision, recall, f1 per class)
IMPORTANT — avoid overfitting:
• Validation is evaluated on HELD-OUT X_val
(the model was trained on X_train only during GridSearchCV)
• Final predictions use the best estimator, which GridSearchCV refits on
all of X_train (refit=True is the default)
• Check CV score vs val score — if CV >> val, the model may be
overfitting the folds (a quick check is sketched below)
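A quick way to run that check, using attributes GridSearchCV actually exposes (best_score_ is the mean cross-validated score of the best parameter set); the 0.05 gap threshold is an arbitrary rule of thumb:

cv_acc  = grid_search.best_score_                      # mean CV accuracy, best params
val_acc = accuracy_score(y_val, model.predict(X_val))  # held-out accuracy
print(f"CV accuracy : {cv_acc:.4f}")
print(f"Val accuracy: {val_acc:.4f}")
if cv_acc - val_acc > 0.05:  # arbitrary threshold, tune to taste
    print("Warning: CV >> val, likely overfitting the folds.")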
TO IMPROVE SCORE:
• Add interaction terms in create_features(df)
• Use stratify=y in train_test_split (already done)
• Balance skewed classes: XGBClassifier has no class_weight parameter,
but per-row sample weights do the same job (see the sketch below)
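A minimal sketch of the sample-weight route, using sklearn's compute_sample_weight; GridSearchCV forwards fit parameters such as sample_weight to each fold's fit:

from sklearn.utils.class_weight import compute_sample_weight

# 'balanced' up-weights rows from rare classes, down-weights common ones
w = compute_sample_weight(class_weight='balanced', y=y_train)
grid_search.fit(X_train, y_train, sample_weight=w)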
════════════════════════════════════════
3. knn.py (K-Nearest Neighbours)
════════════════════════════════════════
WHEN TO USE: classification with smaller datasets;
interpretable; 'training' is just storing the data (KNN is a lazy
learner, so all the cost is paid at prediction time)
EDIT THESE at the top of the file:
TARGET_COL → class label column name
ID_COL → ID column (or None)
REDUNDANT_COLS → columns to drop
ORDINAL_COLS → ordered categorical columns
METRICS reported: Accuracy + Classification Report
CRITICAL NOTES:
• Scaling is MANDATORY for KNN (already done via StandardScaler)
• KNN struggles with high-dimensional data (many columns)
→ If you have >50 features, prefer classification.py instead
• GridSearchCV tunes: n_neighbors, weights, metric
TO IMPROVE SCORE:
• Remove irrelevant features — KNN is sensitive to noise
• Try PCA if you have many features — dimensionality reduction
(see the sketch after this list)
• Use weights='distance' so closer neighbours count more
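A minimal PCA sketch, assuming it runs after the existing StandardScaler step (PCA expects scaled inputs) and after df_test has been aligned to train_cols; the 0.95 variance target is an arbitrary choice, not something the original script sets:

from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)            # keep components explaining 95% of variance
X_train_p = pca.fit_transform(X_train)  # fit on train only
X_val_p   = pca.transform(X_val)
test_p    = pca.transform(df_test)
print(f"Kept {pca.n_components_} of {X_train.shape[1]} dimensions")
grid_search.fit(X_train_p, y_train)     # then tune KNN on the reduced features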
════════════════════════════════════════
COMMON STEPS FOR ALL 3 FILES
════════════════════════════════════════
Step 1 — Set config vars at top of file (example after this list)
Step 2 — Add custom features in create_features(df) if you know the domain
Step 3 — Add column orders to ORDINAL_COLS if any columns have natural order
Step 4 — List junk/leaky columns in REDUNDANT_COLS
Step 5 — Run the script
Step 6 — Upload submission.csv
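For example, a hypothetical top-of-file configuration (every column name below is an invented placeholder):

TARGET_COL = 'price'
ID_COL = 'id'
REDUNDANT_COLS = ['internal_code', 'posted_after_sale']  # junk / leaky
ORDINAL_COLS = {'size': ['small', 'medium', 'large'],
                'condition': ['poor', 'fair', 'good', 'excellent']}
OHE_MAX_CARDINALITY = 15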
KNN
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

TARGET_COL = 'target'  # column to predict (class label)
ID_COL = 'id'          # set to None if no ID column

# Columns to drop entirely (redundant/leaky)
REDUNDANT_COLS = []

# Ordinal columns — define the order of categories
# Example: {'size': ['small', 'medium', 'large']}
ORDINAL_COLS = {}

# Columns with more unique values than this get Label Encoded, else One-Hot
OHE_MAX_CARDINALITY = 15

# ──────────────────────────────────────────────
# 1. LOAD DATA
# ──────────────────────────────────────────────
print("1. Loading data...")
Data = pd.read_csv("data/train.csv")
TData = pd.read_csv("data/test.csv")
print(f"   Train: {Data.shape} | Test: {TData.shape}")

# ──────────────────────────────────────────────
# 2. HELPER FUNCTIONS
# ──────────────────────────────────────────────
def drop_columns(df, columns):
    return df.drop(columns=columns, errors='ignore')


def fix_datetime_cols(df):
    """Auto-detect datetime strings and extract useful numeric features."""
    for col in df.select_dtypes(include=['object', 'string']).columns:
        try:
            # errors='coerce' turns unparseable values into NaT, so the 80%
            # threshold below works even when a column is partially dirty
            parsed = pd.to_datetime(df[col], errors='coerce')
            if parsed.notna().mean() > 0.8:
                df[col + '_year'] = parsed.dt.year
                df[col + '_month'] = parsed.dt.month
                df[col + '_day'] = parsed.dt.day
                df[col + '_weekday'] = parsed.dt.weekday
                df[col + '_hour'] = parsed.dt.hour
                df = df.drop(columns=[col])
        except Exception:
            pass
    return df


def fill_nulls(df, num_medians=None, cat_modes=None, fit=False):
    """
    Fill missing values:
      - Numeric → median (fit on train, apply to test)
      - Object  → mode
    """
    num_cols = df.select_dtypes(include='number').columns.tolist()
    cat_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if fit:
        num_medians = df[num_cols].median()
        cat_modes = {c: df[c].mode()[0] if not df[c].mode().empty else 'Unknown'
                     for c in cat_cols}
    df[num_cols] = df[num_cols].fillna(num_medians)
    for col in cat_cols:
        df[col] = df[col].fillna(cat_modes.get(col, 'Unknown'))
    if fit:
        return df, num_medians, cat_modes
    return df


def encode_features(df_train, df_test, ordinal_cols, ohe_max_cardinality):
    """
    Smart encoding — applied consistently across train & test:
      • Ordinal columns         → OrdinalEncoder (order-preserving)
      • Low-cardinality object  → One-Hot Encoding
      • High-cardinality object → Label Encoding
    """
    obj_cols = df_train.select_dtypes(include=['object', 'string']).columns.tolist()
    ord_keys = [c for c in ordinal_cols if c in df_train.columns]
    ohe_cols = [c for c in obj_cols
                if c not in ord_keys and df_train[c].nunique() <= ohe_max_cardinality]
    label_cols = [c for c in obj_cols
                  if c not in ord_keys and df_train[c].nunique() > ohe_max_cardinality]

    print(f"   Ordinal encoded : {ord_keys}")
    print(f"   One-Hot encoded : {ohe_cols}")
    print(f"   Label encoded   : {label_cols}")

    # --- Ordinal Encoding ---
    if ord_keys:
        categories = [ordinal_cols[c] for c in ord_keys]
        oe = OrdinalEncoder(categories=categories,
                            handle_unknown='use_encoded_value', unknown_value=-1)
        # cast both frames to str so train and test are encoded identically
        df_train[ord_keys] = oe.fit_transform(df_train[ord_keys].astype(str))
        df_test[ord_keys] = oe.transform(df_test[ord_keys].astype(str))

    # --- Label Encoding (high cardinality) ---
    for col in label_cols:
        le = LabelEncoder()
        combined = pd.concat(
            [df_train[col],
             df_test[col] if col in df_test.columns else pd.Series(dtype=str)],
            axis=0).astype(str)
        le.fit(combined)
        df_train[col] = le.transform(df_train[col].astype(str))
        if col in df_test.columns:
            df_test[col] = df_test[col].astype(str).apply(
                lambda x: le.transform([x])[0] if x in le.classes_ else -1
            )

    # --- One-Hot Encoding (low cardinality) ---
    df_train = pd.get_dummies(df_train, columns=ohe_cols)
    df_test = pd.get_dummies(df_test,
                             columns=[c for c in ohe_cols if c in df_test.columns])

    # Align columns — test may be missing some OHE columns
    df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
    return df_train, df_test


def create_features(df):
    """Add engineered features here."""
    return df

# ──────────────────────────────────────────────
# 3. PREPROCESS
# ──────────────────────────────────────────────
print("2. Preprocessing data...")
test_ids = TData[ID_COL] if ID_COL and ID_COL in TData.columns else None
df = drop_columns(Data, [ID_COL] if ID_COL else [])
df_test = drop_columns(TData, [ID_COL] if ID_COL else [])

# Datetime
df = fix_datetime_cols(df)
df_test = fix_datetime_cols(df_test)

# Missing values — fit on train, apply to test
df, num_medians, cat_modes = fill_nulls(df, fit=True)
df_test = fill_nulls(df_test, num_medians=num_medians, cat_modes=cat_modes)

# Feature engineering
df = create_features(df)
df_test = create_features(df_test)

# Drop redundant columns
df = drop_columns(df, REDUNDANT_COLS)
df_test = drop_columns(df_test, REDUNDANT_COLS)

# Encode target column
target_le = LabelEncoder()
df[TARGET_COL] = target_le.fit_transform(df[TARGET_COL].astype(str))
print(f"   Classes: {list(target_le.classes_)}")

# Encode features
df_features = df.drop(columns=[TARGET_COL])
df_target = df[TARGET_COL]
df_features, df_test = encode_features(
    df_features, df_test, ORDINAL_COLS, OHE_MAX_CARDINALITY
)
processed_data = pd.concat([df_features, df_target], axis=1)

# ──────────────────────────────────────────────
# 4. SPLIT & SCALE
# ──────────────────────────────────────────────
# NOTE: Scaling is CRITICAL for KNN — it is distance-based, so all
# features must be on the same scale to avoid bias.
print("3. Splitting and scaling...")
X = processed_data.drop(columns=[TARGET_COL])
y = processed_data[TARGET_COL]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# explicit copies avoid pandas SettingWithCopyWarning when scaling in place
X_train, X_val = X_train.copy(), X_val.copy()

scaler = StandardScaler()
num_cols = X_train.select_dtypes(include='number').columns
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
df_test[num_cols] = scaler.transform(df_test[num_cols])
train_cols = X_train.columns

# ──────────────────────────────────────────────
# 5. MODEL + GRIDSEARCHCV
# ──────────────────────────────────────────────
print("4. Training KNN with GridSearchCV...")
model_base = KNeighborsClassifier(n_jobs=-1)
param_grid = {
    'n_neighbors': [3, 5, 7, 11, 15],          # number of neighbours to consider
    'weights'    : ['uniform', 'distance'],    # uniform = all equal, distance = closer counts more
    'metric'     : ['euclidean', 'manhattan'], # distance metric
}
grid_search = GridSearchCV(
    estimator  = model_base,
    param_grid = param_grid,
    cv         = 3,
    scoring    = 'accuracy',
    n_jobs     = -1,
    verbose    = 2,
)
grid_search.fit(X_train, y_train)
model = grid_search.best_estimator_
print(f"   Best params: {grid_search.best_params_}")

# ──────────────────────────────────────────────
# 6. EVALUATE
# ──────────────────────────────────────────────
y_pred_val = model.predict(X_val)
print(f"\nValidation Accuracy: {accuracy_score(y_val, y_pred_val):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred_val, target_names=target_le.classes_))

# ──────────────────────────────────────────────
# 7. PREDICT TEST DATA & SAVE
# ──────────────────────────────────────────────
print("5. Predicting test data...")
df_test = df_test.reindex(columns=train_cols, fill_value=0)
pred_encoded = model.predict(df_test)
predictions = target_le.inverse_transform(pred_encoded)  # decode back to original labels
if test_ids is not None:
    submission = pd.DataFrame({ID_COL: test_ids, TARGET_COL: predictions})
else:
    submission = pd.DataFrame({TARGET_COL: predictions})
submission.to_csv('submission.csv', index=False)
print("Done! submission.csv written.")