# NOTE(review): this script was pasted from a rich-text source; the original had
# a bare prose line ("Below is my code. My current accuracy is 77%") and
# typographic quotes (‘ ’), both of which are SyntaxErrors — fixed to a comment
# and ASCII quotes respectively.
import pandas as pd
import numpy as np
from sklearn import metrics
import warnings
warnings.simplefilter('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb
from datetime import datetime, timedelta
# Loading the data: processed BPI Challenge 2012 event log, subsampled to a
# fixed-size, reproducible sample.
bpi_challenge_2012 = pd.read_csv("/content/drive/MyDrive/DATASET/bpi_2012_processed_df.csv")
sampled_df = bpi_challenge_2012.sample(n=262200, random_state=101)
bpi_challenge_2012 = sampled_df.copy()
bpi_challenge_2012.dropna(axis=0, inplace=True)
# Drop the CSV's stray index column (ASCII quotes — the curly quotes in the
# original were a SyntaxError).
bpi_challenge_2012.drop(['Unnamed: 0'], axis=1, inplace=True)

# Date manipulation: collapse both timestamps to YYYYMMDD day-granularity strings.
bpi_challenge_2012["time_timestamp"] = pd.to_datetime(bpi_challenge_2012["time_timestamp"]).dt.strftime("%Y%m%d")
bpi_challenge_2012["case_REG_DATE"] = pd.to_datetime(bpi_challenge_2012["case_REG_DATE"]).dt.strftime("%Y%m%d")
# A bare `df.dtypes` expression is a no-op in a script (only echoes in a
# notebook) — print it so the inspection is actually visible.
print(bpi_challenge_2012.dtypes)
# Rename columns: translate the Dutch activity names to English and give the
# target column a clearer name.
column_mapping = {
    'true_activity': 'activity_transformed',
    'W_Afhandelen_leads': 'W_Handle_leads',
    'W_Valideren_aanvraag': 'W_Validate_request',
    'W_Beoordelen_fraude': 'W_Assess_fraud',
    'W_Wijzigen_contractgegeven': 'W_Change_contract_data',
    'W_Nabellen_offertes': 'W_Nabellen_quotes'
}
# columns= is the idiomatic equivalent of mapper=..., axis='columns' (the
# original also used curly quotes around 'columns', a SyntaxError).
bpi_challenge_2012.rename(columns=column_mapping, inplace=True)
# Factorize categorical features: integer-encode each categorical column in
# place (pd.factorize returns (codes, uniques); we keep only the codes).
for column in ['lifecycle_transition', 'concept_name', 'weekday', 'saturday', 'sunday',
               'activity_transformed', 'tracePosition', 'time_timestamp', 'case_REG_DATE']:
    bpi_challenge_2012[column] = pd.factorize(bpi_challenge_2012[column])[0]
# Split the data into train/test partitions before any fitting.
seed = 117
train, test = train_test_split(bpi_challenge_2012, test_size=0.2, random_state=seed)
# Explicit copies so the scaled column assignments below write to independent
# frames instead of views (avoids pandas SettingWithCopyWarning / silent no-ops).
train = train.copy()
test = test.copy()

# Numerical features to standardize.
numerical_features = ['org_resource', 'tracePosition', 'W_Assess_fraud', 'W_Completeren_aanvraag', 'concept_name',
                      'lifecycle_transition', 'dayOfWeek', 'case_AMOUNT_REQ', 'EventID', 'weekday', 'sunday',
                      'W_Nabellen_incomplete_dossiers', 'time_timestamp', 'case_REG_DATE']

# Standardize: fit the scaler on train only, then apply the *same* transform to
# test — prevents information leaking from the test split.
scaler = StandardScaler()
train[numerical_features] = scaler.fit_transform(train[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])
# Define features and target variable from the training partition.
x, y = train.drop('activity_transformed', axis=1), train['activity_transformed']
# Secondary split for model selection. random_state added: the original call
# was unseeded, so accuracy varied between runs and was not reproducible.
# NOTE(review): the held-out `test` frame from the earlier split is never used
# for evaluation — consider scoring on it as the final unbiased estimate.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=seed)
# Hyperparameter randomized search over a small XGBoost grid.
param_dist = {
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [5, 7, 10],
    'n_estimators': [50, 100, 200],
    'colsample_bytree': [0.3, 0.5, 0.7],
    'subsample': [0.8, 0.9, 1.0]
}
# num_class = number of distinct target labels (labels are 0..k-1 codes from
# pd.factorize above, which is what multi:softmax expects).
xg_clf = xgb.XGBClassifier(objective='multi:softmax',
                           num_class=train['activity_transformed'].nunique(),
                           verbosity=2)
# random_state makes the sampled candidates reproducible; n_jobs=-1 runs the
# 10 x 3 CV fits in parallel.
random_search = RandomizedSearchCV(estimator=xg_clf, param_distributions=param_dist,
                                   scoring='accuracy', n_iter=10, cv=3, verbose=2,
                                   n_jobs=-1, random_state=seed)
random_search.fit(X_train, y_train)
# Print the best parameters found by the search.
print("Best Hyperparameters:", random_search.best_params_)

# Initialize an XGBoost classifier with the best parameters and refit on the
# training split. (NOTE(review): random_search.best_estimator_ is already
# refit on X_train with these params; the explicit refit is kept for clarity.)
best_params = random_search.best_params_
xg_clf = xgb.XGBClassifier(objective='multi:softmax',
                           num_class=train['activity_transformed'].nunique(),
                           verbosity=2, **best_params)
xg_clf.fit(X_train, y_train)
# Make predictions on the held-out split and report the evaluation metrics.
y_test_pred = xg_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred, average='weighted')
# pos_label dropped: scikit-learn ignores it when average='micro' is used on a
# multiclass problem, and 'positive' was not a valid label anyway (the targets
# are integer factorize codes). With micro averaging, precision == recall ==
# accuracy for multiclass, so these mainly serve as a sanity check.
precision_ = metrics.precision_score(y_test, y_test_pred, average='micro')
recall = metrics.recall_score(y_test, y_test_pred, average='micro')
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"F1 Score: {f1:.2f}")
print(f"precision score: {precision_:.2f}")
print("Recall: ", recall)