import pandas as pd
# Load the raw tables and drop the row-identifier column, which carries
# no predictive signal.
train = pd.read_csv('Train.csv').drop(columns='ID')
test = pd.read_csv('Test.csv').drop(columns='ID')
# The target is whatever column exists only in the training table ('Yield').
TARGET_FEATURE = train.columns.difference(test.columns)
import xgboost
import lightgbm
import numpy as np
import catboost
from sklearn.pipeline import *
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import KFold, cross_validate
from datetime import *
def clean(df, training=1):
    """Feature-engineer a raw crop table in place and return it.

    Adds a sowing-to-harvest duration, several per-acre ratio features,
    and fertilizer totals, then converts every object column to pandas
    'category' dtype (missing values become the 'Missing' category).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw train or test frame containing the agronomic columns used below.
    training : int or bool, default 1
        Truthy for the training table. When falsy, category columns are
        re-coded with the *global* ``train`` frame's category dtypes so that
        train and test share identical category mappings.

    Returns
    -------
    pandas.DataFrame
        The same frame, mutated in place, with the new feature columns.
    """
    harv = pd.to_datetime(df['Harv_date'], errors='coerce')
    seed = pd.to_datetime(df['SeedingSowingTransplanting'], errors='coerce')
    # Days from sowing to harvest; rows with unparseable dates get -1.
    df['TotalCropDuration'] = (harv - seed).dt.days.fillna(-1).astype(float)
    # Ratio features; the +1e-5 guards against division by zero.
    # (The original computed Irrigation_Density twice -- duplicate removed.)
    df['Irrigation_Density'] = df['TransplantingIrrigationHours'] / (df['Acre'] + 1e-5)
    df['Cultivation_Intensity'] = df['CropCultLand'] / (df['CultLand'] + 1e-5)
    df['Cost_Per_Acre'] = df['TransIrriCost'] / (df['Acre'] + 1e-5)
    # Fertilizer aggregates (DataFrame.sum treats NaN as 0 here).
    df['Total_Urea'] = df[['BasalUrea', '1tdUrea', '2tdUrea']].sum(axis=1)
    df['Total_Basal'] = df[['BasalDAP', 'BasalUrea']].sum(axis=1)
    df['Total_Fertilizer'] = df['Total_Urea'] + df['BasalDAP'] + df['Ganaura'] + df['CropOrgFYM']
    df['Nutrient_Density'] = df['Total_Fertilizer'] / (df['Acre'] + 1e-5)
    # String columns -> category dtype; NaN becomes an explicit 'Missing' level.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('Missing').astype('category')
    if not training:
        # Align test categories with the training frame's levels so encoders
        # see identical category codes. NOTE(review): relies on the
        # module-level `train` having been cleaned first -- confirm call order.
        for col in df.select_dtypes('category'):
            df[col] = df[col].astype(train[col].dtype)
    return df
# Engineer features; train goes first so the test frame's categories can be
# aligned to the training frame inside clean().
train = clean(train, training=1)
test = clean(test, training=0)
X_train = train.drop(TARGET_FEATURE, axis='columns')
y_train = train[TARGET_FEATURE]
X_test = test
# Names of the categorical columns (identical in train and test after clean()).
cat_features = [col for col in test.columns if test[col].dtype == 'category']
# Alternative models kept from earlier experiments:
#model = xgboost.XGBRegressor(n_estimators=337, enable_categorical=True)
model = lightgbm.LGBMRegressor()
#model = catboost.CatBoostRegressor(loss_function='RMSE', cat_features=tuple(cat_features), iterations=3000, verbose=100, l2_leaf_reg=24)
# 5-fold shuffled CV with a fixed seed so RMSE scores are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=88301)
results = cross_validate(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=kf)
print(results)
# sklearn reports *negative* RMSE; negate to print the conventional value.
print(-results['test_score'].mean())
# Refit on the full training set before predicting on the test set.
model.fit(X_train, y_train)
predictions = model.predict(X_test)
final_sub = pd.read_csv("SampleSubmission.csv")
final_sub['Yield_RMSE'] = predictions
final_sub['Yield_MAE'] = predictions
# index=False: without it, to_csv prepends the row index as an extra unnamed
# column, which breaks the expected submission-file format.
final_sub.to_csv('XGAtt1.csv', index=False)
# --- Autoencoder latent-feature extractor (Keras) ------------------------
# NOTE(review): `Input`, `Dense`, and `Model` are never imported anywhere in
# this file (e.g. `from tensorflow.keras.layers import Input, Dense` /
# `from tensorflow.keras.models import Model`), so this whole section raises
# NameError as written -- add the imports before running.
input_dim = X_train.shape[1]  # number of input features to reconstruct
encoding_dim = 8  # width of the compressed latent representation
input_layer = Input(shape=(input_dim,))
# Encoder: funnel 32 -> 16 -> 8 down to the named bottleneck layer.
encoded = Dense(32, activation='relu')(input_layer)
encoded = Dense(16, activation='relu')(encoded)
bottleneck = Dense(encoding_dim, activation='relu', name='bottleneck')(encoded)
# Decoder mirrors the encoder back up to the original dimensionality.
decoded = Dense(16, activation='relu')(bottleneck)
decoded = Dense(32, activation='relu')(decoded)
output_layer = Dense(input_dim, activation='linear')(decoded)
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')
# NOTE(review): `X_val` is never defined in this file, so this fit call will
# fail. Also, X_train still contains pandas 'category' columns from clean(),
# which presumably need numeric encoding before Keras can consume them --
# TODO confirm.
autoencoder.fit(X_train, X_train, epochs=50, batch_size=256, validation_data=(X_val, X_val))
# Reuse the trained encoder half to map rows into the 8-d latent space.
encoder = Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('bottleneck').output)
X_train_latent = encoder.predict(X_train)
This is my code, and I wanted to ask whether it is optimal. Note that stratified splitting requires at least 2 samples per class, because it must place at least one sample in both the train and test sets.
General Parameters are responsible for defining the overall functionality of the XGBoost model. These parameters include decisions like which booster (tree, linear model, etc.) to use, how much verbosity to allow during training, or if GPU should be used. Eg: booster='gbtree', verbosity=1
Booster Parameters define how each boosting round should be performed. These include parameters specific to tree-based models such as max_depth, min_child_weight, and subsample. They determine the complexity of each tree, the regularization strategy, and how data is sampled. Eg: max_depth=6, subsample=0.8, colsample_bytree=0.7
Learning Task Parameters are used to define the loss function and the objective of the model. These parameters determine what type of learning task you are solving (e.g., binary classification, multi-class classification, regression) and how performance is evaluated during training. Eg: objective='binary:logistic', eval_metric='auc'