We will train an XGBoost model on the Adult Income dataset and deploy it on Hugging Face Spaces.
!pip install xgboost==1.6.2
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: xgboost==1.6.2 in /home/bom/.local/lib/python3.10/site-packages (1.6.2) Requirement already satisfied: numpy in /home/bom/.local/lib/python3.10/site-packages (from xgboost==1.6.2) (1.23.4) Requirement already satisfied: scipy in /home/bom/.local/lib/python3.10/site-packages (from xgboost==1.6.2) (1.9.2)
!wget http://www.donlapark.cmustat.com/Income.csv
--2022-10-25 14:47:43-- http://www.donlapark.cmustat.com/Income.csv Resolving www.donlapark.cmustat.com (www.donlapark.cmustat.com)... 150.107.31.67 Connecting to www.donlapark.cmustat.com (www.donlapark.cmustat.com)|150.107.31.67|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 3069744 (2.9M) [text/csv] Saving to: ‘Income.csv’ Income.csv 100%[===================>] 2.93M 4.91MB/s in 0.6s 2022-10-25 14:47:43 (4.91 MB/s) - ‘Income.csv’ saved [3069744/3069744]
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
# Maps each education label to an integer rank (1..16), assigned in the order
# the labels are listed below. The insertion order of the keys is preserved,
# so iterating over EDU_DICT yields the labels in this same order.
EDU_DICT = {
    level: rank
    for rank, level in enumerate(
        (
            'Preschool',
            '1st-4th',
            '5th-6th',
            '7th-8th',
            '9th',
            '10th',
            '11th',
            '12th',
            'HS-grad',
            'Some-college',
            'Assoc-voc',
            'Assoc-acdm',
            'Bachelors',
            'Masters',
            'Prof-school',
            'Doctorate',
        ),
        start=1,
    )
}
# Load the downloaded CSV into a DataFrame. At this point it still contains
# the "income" target column alongside the features.
X_train = pd.read_csv(filepath_or_buffer='Income.csv')
# Bare expression: the notebook renders the DataFrame as the cell output.
X_train
age | workclass | education | marital.status | occupation | relationship | race | sex | hours.per.week | native.country | income | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 90 | ? | HS-grad | Widowed | ? | Not-in-family | White | Female | 40 | United-States | <=50K |
1 | 82 | Private | HS-grad | Widowed | Exec-managerial | Not-in-family | White | Female | 18 | United-States | <=50K |
2 | 66 | ? | Some-college | Widowed | ? | Unmarried | Black | Female | 40 | United-States | <=50K |
3 | 54 | Private | 7th-8th | Divorced | Machine-op-inspct | Unmarried | White | Female | 40 | United-States | <=50K |
4 | 41 | Private | Some-college | Separated | Prof-specialty | Own-child | White | Female | 40 | United-States | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | 22 | Private | Some-college | Never-married | Protective-serv | Not-in-family | White | Male | 40 | United-States | <=50K |
32557 | 27 | Private | Assoc-acdm | Married-civ-spouse | Tech-support | Wife | White | Female | 38 | United-States | <=50K |
32558 | 40 | Private | HS-grad | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 40 | United-States | >50K |
32559 | 58 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | Female | 40 | United-States | <=50K |
32560 | 22 | Private | HS-grad | Never-married | Adm-clerical | Own-child | White | Male | 20 | United-States | <=50K |
32561 rows × 11 columns
# Split the target off the feature frame: pop() removes the "income" column
# from X_train and returns it.
y_train = X_train.pop("income")
# Binary target: 1 where income is ">50K", 0 otherwise.
y_train = (y_train == ">50K").astype(int)
# Encode education as its integer rank from EDU_DICT.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form triggers a FutureWarning in recent pandas and, with Copy-on-Write
# enabled, leaves X_train unmodified. map() also yields a numeric column
# directly rather than relying on replace() downcasting an object column.
# NOTE: a label missing from EDU_DICT becomes NaN here (it would have stayed
# a string with replace()).
X_train['education'] = X_train['education'].map(EDU_DICT)
# Column names of the numerical features (int64 / float64 dtypes).
num_col = X_train.select_dtypes(
    include=['int64', 'float64'],
).columns
# Column names of the categorical features (object / bool dtypes).
cat_col = X_train.select_dtypes(
    include=['object', 'bool'],
).columns
# Show both column groups, one per line.
print(num_col, cat_col, sep='\n')
Index(['age', 'education', 'hours.per.week'], dtype='object') Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'], dtype='object')
# Preprocessing: standardize the numerical columns and one-hot encode the
# categorical ones (dense output).
preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), num_col),
        ("onehot", OneHotEncoder(sparse=False), cat_col),
    ],
)
# Full model: preprocessing followed by an XGBoost classifier with default
# hyperparameters, bundled in one pipeline so raw feature rows can be passed
# straight to fit/predict.
model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier()),
    ],
)
# fit() returns the fitted pipeline, which the notebook renders as output.
model.fit(X_train, y_train)
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('scaler', StandardScaler(), Index(['age', 'education', 'hours.per.week'], dtype='object')), ('onehot', OneHotEncoder(sparse=False), Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'], dtype='object'))])), ('classifier', XGBClassifier(base_score=0.5, boo... gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('scaler', StandardScaler(), Index(['age', 'education', 'hours.per.week'], dtype='object')), ('onehot', OneHotEncoder(sparse=False), Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'], dtype='object'))])), ('classifier', XGBClassifier(base_score=0.5, boo... gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1, ...))])
ColumnTransformer(transformers=[('scaler', StandardScaler(), Index(['age', 'education', 'hours.per.week'], dtype='object')), ('onehot', OneHotEncoder(sparse=False), Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'], dtype='object'))])
Index(['age', 'education', 'hours.per.week'], dtype='object')
StandardScaler()
Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country'], dtype='object')
OneHotEncoder(sparse=False)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1, ...)
import joblib
# Persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(value=model, filename='model.joblib')
['model.joblib']
# Collect the distinct values of every categorical column, then add the
# education labels in the order they appear in EDU_DICT (education itself is
# no longer a categorical column once it has been encoded as integers).
unique_values = {
    **{name: X_train[name].unique() for name in cat_col},
    'education': list(EDU_DICT),
}
# Save the mapping next to the model.
joblib.dump(unique_values, 'unique_values.joblib')
['unique_values.joblib']
# Bare expression: the notebook displays the per-feature value lists.
unique_values
{'workclass': array(['?', 'Private', 'State-gov', 'Federal-gov', 'Self-emp-not-inc', 'Self-emp-inc', 'Local-gov', 'Without-pay', 'Never-worked'], dtype=object), 'marital.status': array(['Widowed', 'Divorced', 'Separated', 'Never-married', 'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'], dtype=object), 'occupation': array(['?', 'Exec-managerial', 'Machine-op-inspct', 'Prof-specialty', 'Other-service', 'Adm-clerical', 'Craft-repair', 'Transport-moving', 'Handlers-cleaners', 'Sales', 'Farming-fishing', 'Tech-support', 'Protective-serv', 'Armed-Forces', 'Priv-house-serv'], dtype=object), 'relationship': array(['Not-in-family', 'Unmarried', 'Own-child', 'Other-relative', 'Husband', 'Wife'], dtype=object), 'race': array(['White', 'Black', 'Asian-Pac-Islander', 'Other', 'Amer-Indian-Eskimo'], dtype=object), 'sex': array(['Female', 'Male'], dtype=object), 'native.country': array(['United-States', '?', 'Mexico', 'Greece', 'Vietnam', 'China', 'Taiwan', 'India', 'Philippines', 'Trinadad&Tobago', 'Canada', 'South', 'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran', 'England', 'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba', 'Ireland', 'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic', 'Haiti', 'El-Salvador', 'Hungary', 'Columbia', 'Guatemala', 'Jamaica', 'Ecuador', 'France', 'Yugoslavia', 'Scotland', 'Portugal', 'Laos', 'Thailand', 'Outlying-US(Guam-USVI-etc)'], dtype=object), 'education': ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', 'HS-grad', 'Some-college', 'Assoc-voc', 'Assoc-acdm', 'Bachelors', 'Masters', 'Prof-school', 'Doctorate']}