# Code
# Importing Libraries
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from catboost import CatBoostClassifier
# STAT 303-3
# Importing Libraries
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from catboost import CatBoostClassifier
# Reading the data
train = pd.read_csv("train_clas.csv")
test = pd.read_csv("test_clas.csv")

# Target
y_train = train["host_is_superhost"]

# Dropping features: the identifier and two other columns are removed from
# both sets; the target column is additionally removed from the training set.
X_train = train.drop(
    columns=["id", "host_is_superhost", "description", "host_about"]
)
X_test = test.drop(columns=["id", "description", "host_about"])
# Cleaning the data
# Strip the trailing '%' from the rate columns and convert them to floats,
# applying the same transformation to the training and test sets.
for col in ['host_response_rate', 'host_acceptance_rate']:
    X_train[col] = X_train[col].str.rstrip('%').astype(float)
    X_test[col] = X_test[col].str.rstrip('%').astype(float)
def extract_bathroom_count(x):
    """Parse the first space-separated token of ``x`` as a float.

    Args:
        x: Value to parse; expected to be a string whose first token is a
            number (e.g. ``"1.5 baths"``).

    Returns:
        The leading token converted to ``float``, or ``None`` when ``x`` has
        no ``split`` method (e.g. ``None`` or a float NaN) or when the first
        token is not a valid number.
    """
    try:
        return float(x.split(' ')[0])
    except (AttributeError, ValueError):
        # AttributeError: x is not a string (missing value).
        # ValueError: the first token is not numeric.
        return None
# Replace the 'bathrooms_text' column with the value returned by
# extract_bathroom_count for each entry, in both sets.
X_train['bathrooms_text'] = X_train['bathrooms_text'].apply(extract_bathroom_count)
X_test['bathrooms_text'] = X_test['bathrooms_text'].apply(extract_bathroom_count)
# Filling missing data
# Numerical features
num_cols = X_train.select_dtypes(include='number').columns

for col in num_cols:
    # Compute the training-set mean once and use it for both sets, so the
    # test set is imputed with the training statistic.
    train_mean = X_train[col].mean()
    X_train[col] = X_train[col].fillna(train_mean)
    X_test[col] = X_test[col].fillna(train_mean)
# Categorical features
cat_features = X_train.select_dtypes(exclude='number').columns.tolist()

for col in cat_features:
    # Fill before casting: astype(str) turns NaN into the literal string
    # "nan", after which fillna("missing") has nothing left to replace.
    X_train[col] = X_train[col].fillna("missing").astype(str)
    X_test[col] = X_test[col].fillna("missing").astype(str)

# used for fitting the model
cat_feature_indices = [X_train.columns.get_loc(col) for col in cat_features]
# Training and fitting the model
# NOTE(review): the keyword/value pairing below was reconstructed from a
# garbled export in which each name was shifted one position away from its
# value -- confirm against the original notebook.
model = CatBoostClassifier(
    random_state=1,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.75,
    reg_lambda=1,
    verbose=False,
    scale_pos_weight=5,
    thread_count=1,
)
model.fit(X_train, y_train, cat_features=cat_feature_indices)

# Keep the second column of predict_proba as the score for each test row.
test_probs = model.predict_proba(X_test)[:, 1]
# Creating the submission file
# Pair each test-set id with its predicted score and write the result to CSV
# without the DataFrame index.
submission_classification = pd.DataFrame(
    {'id': test['id'], 'predicted': test_probs}
)
submission_classification.to_csv('submission_classification.csv', index=False)