We will train an XGBoost model on the Adult Income dataset and deploy it on Hugging Face Spaces.

In [1]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel rather than whichever `pip` is first on the shell PATH.
%pip install xgboost==1.6.2
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: xgboost==1.6.2 in /home/bom/.local/lib/python3.10/site-packages (1.6.2)
Requirement already satisfied: numpy in /home/bom/.local/lib/python3.10/site-packages (from xgboost==1.6.2) (1.23.4)
Requirement already satisfied: scipy in /home/bom/.local/lib/python3.10/site-packages (from xgboost==1.6.2) (1.9.2)
In [2]:
# -nc (no-clobber): skip the download when Income.csv already exists. Without it,
# re-running this cell saves a second copy as Income.csv.1 while the loading
# cell keeps reading the original Income.csv.
!wget -nc http://www.donlapark.cmustat.com/Income.csv
--2022-10-25 14:47:43--  http://www.donlapark.cmustat.com/Income.csv
Resolving www.donlapark.cmustat.com (www.donlapark.cmustat.com)... 150.107.31.67
Connecting to www.donlapark.cmustat.com (www.donlapark.cmustat.com)|150.107.31.67|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3069744 (2.9M) [text/csv]
Saving to: ‘Income.csv’

Income.csv          100%[===================>]   2.93M  4.91MB/s    in 0.6s    

2022-10-25 14:47:43 (4.91 MB/s) - ‘Income.csv’ saved [3069744/3069744]

In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from xgboost import XGBClassifier


# Ordinal encoding for the `education` column: levels are listed from lowest to
# highest, and each level's 1-based position in the list is its integer code
# (Preschool -> 1, ..., Doctorate -> 16).
EDU_DICT = {
    level: rank
    for rank, level in enumerate(
        [
            'Preschool',
            '1st-4th',
            '5th-6th',
            '7th-8th',
            '9th',
            '10th',
            '11th',
            '12th',
            'HS-grad',
            'Some-college',
            'Assoc-voc',
            'Assoc-acdm',
            'Bachelors',
            'Masters',
            'Prof-school',
            'Doctorate',
        ],
        start=1,
    )
}


# Load the CSV fetched by the wget cell. At this point the frame still holds
# the `income` target column; it is popped off in the next cell.
X_train = pd.read_csv('Income.csv')

# Bare last expression so the notebook renders the frame as the cell output.
X_train
Out[3]:
age workclass education marital.status occupation relationship race sex hours.per.week native.country income
0 90 ? HS-grad Widowed ? Not-in-family White Female 40 United-States <=50K
1 82 Private HS-grad Widowed Exec-managerial Not-in-family White Female 18 United-States <=50K
2 66 ? Some-college Widowed ? Unmarried Black Female 40 United-States <=50K
3 54 Private 7th-8th Divorced Machine-op-inspct Unmarried White Female 40 United-States <=50K
4 41 Private Some-college Separated Prof-specialty Own-child White Female 40 United-States <=50K
... ... ... ... ... ... ... ... ... ... ... ...
32556 22 Private Some-college Never-married Protective-serv Not-in-family White Male 40 United-States <=50K
32557 27 Private Assoc-acdm Married-civ-spouse Tech-support Wife White Female 38 United-States <=50K
32558 40 Private HS-grad Married-civ-spouse Machine-op-inspct Husband White Male 40 United-States >50K
32559 58 Private HS-grad Widowed Adm-clerical Unmarried White Female 40 United-States <=50K
32560 22 Private HS-grad Never-married Adm-clerical Own-child White Male 20 United-States <=50K

32561 rows × 11 columns

In [4]:
# Split off the target and binarise it: 1 where income is ">50K", 0 otherwise.
# NOTE: this cell mutates X_train (pop + re-encoding), so re-run the loading
# cell before executing it a second time.
y_train = X_train.pop("income")
y_train = (y_train == ">50K").astype(int)

# Ordinal-encode education by assigning the mapped column back to the frame.
# Calling `.replace(..., inplace=True)` on `X_train['education']` is a chained
# in-place update: it emits a warning on recent pandas and does not modify
# X_train at all when Copy-on-Write is enabled.
X_train['education'] = X_train['education'].map(EDU_DICT)
# `.map` produces NaN for labels missing from EDU_DICT; fail loudly instead of
# silently feeding missing values to the scaler.
assert X_train['education'].notna().all(), "education contains labels missing from EDU_DICT"

# Names of numerical features
num_col = X_train.select_dtypes(include=['int64', 'float64']).columns
# Names of categorical features
cat_col = X_train.select_dtypes(include=['object', 'bool']).columns

print(num_col)
print(cat_col)
Index(['age', 'education', 'hours.per.week'], dtype='object')
Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')
In [5]:
# Column-wise preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones (dense output).
# NOTE(review): scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` — confirm the installed version before upgrading.
numeric_step = ("scaler", StandardScaler(), num_col)
categorical_step = ("onehot", OneHotEncoder(sparse=False), cat_col)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Chain preprocessing and the classifier so the saved artefact accepts the raw
# feature columns directly.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier()),
    ]
)

# Fit on the full frame; the fitted pipeline is the cell's displayed output.
model.fit(X_train, y_train)
Out[5]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  Index(['age', 'education', 'hours.per.week'], dtype='object')),
                                                 ('onehot',
                                                  OneHotEncoder(sparse=False),
                                                  Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object'))])),
                ('classifier',
                 XGBClassifier(base_score=0.5, boo...
                               gamma=0, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=6, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                               predictor='auto', random_state=0, reg_alpha=0,
                               reg_lambda=1, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                                  Index(['age', 'education', 'hours.per.week'], dtype='object')),
                                                 ('onehot',
                                                  OneHotEncoder(sparse=False),
                                                  Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object'))])),
                ('classifier',
                 XGBClassifier(base_score=0.5, boo...
                               gamma=0, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=6, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                               predictor='auto', random_state=0, reg_alpha=0,
                               reg_lambda=1, ...))])
ColumnTransformer(transformers=[('scaler', StandardScaler(),
                                 Index(['age', 'education', 'hours.per.week'], dtype='object')),
                                ('onehot', OneHotEncoder(sparse=False),
                                 Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object'))])
Index(['age', 'education', 'hours.per.week'], dtype='object')
StandardScaler()
Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')
OneHotEncoder(sparse=False)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

Saving the model¶

In [6]:
import joblib

# Persist the fitted pipeline (preprocessing + classifier) as a single file.
# joblib.dump returns the list of written filenames, shown as the cell output.
joblib.dump(value=model, filename='model.joblib')
Out[6]:
['model.joblib']
In [7]:
# Allowed choices per input field: the observed values of every categorical
# column, plus the education labels in EDU_DICT order (that column was already
# converted to integer codes, so its labels come from the mapping instead).
unique_values = {
    **{name: X_train[name].unique() for name in cat_col},
    'education': list(EDU_DICT),
}

# Save the choices next to the model file.
joblib.dump(value=unique_values, filename='unique_values.joblib')
Out[7]:
['unique_values.joblib']
In [8]:
# Display the saved choices: NumPy object arrays for the categorical columns
# and a plain list of labels for `education`.
unique_values
Out[8]:
{'workclass': array(['?', 'Private', 'State-gov', 'Federal-gov', 'Self-emp-not-inc',
        'Self-emp-inc', 'Local-gov', 'Without-pay', 'Never-worked'],
       dtype=object),
 'marital.status': array(['Widowed', 'Divorced', 'Separated', 'Never-married',
        'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'],
       dtype=object),
 'occupation': array(['?', 'Exec-managerial', 'Machine-op-inspct', 'Prof-specialty',
        'Other-service', 'Adm-clerical', 'Craft-repair',
        'Transport-moving', 'Handlers-cleaners', 'Sales',
        'Farming-fishing', 'Tech-support', 'Protective-serv',
        'Armed-Forces', 'Priv-house-serv'], dtype=object),
 'relationship': array(['Not-in-family', 'Unmarried', 'Own-child', 'Other-relative',
        'Husband', 'Wife'], dtype=object),
 'race': array(['White', 'Black', 'Asian-Pac-Islander', 'Other',
        'Amer-Indian-Eskimo'], dtype=object),
 'sex': array(['Female', 'Male'], dtype=object),
 'native.country': array(['United-States', '?', 'Mexico', 'Greece', 'Vietnam', 'China',
        'Taiwan', 'India', 'Philippines', 'Trinadad&Tobago', 'Canada',
        'South', 'Holand-Netherlands', 'Puerto-Rico', 'Poland', 'Iran',
        'England', 'Germany', 'Italy', 'Japan', 'Hong', 'Honduras', 'Cuba',
        'Ireland', 'Cambodia', 'Peru', 'Nicaragua', 'Dominican-Republic',
        'Haiti', 'El-Salvador', 'Hungary', 'Columbia', 'Guatemala',
        'Jamaica', 'Ecuador', 'France', 'Yugoslavia', 'Scotland',
        'Portugal', 'Laos', 'Thailand', 'Outlying-US(Guam-USVI-etc)'],
       dtype=object),
 'education': ['Preschool',
  '1st-4th',
  '5th-6th',
  '7th-8th',
  '9th',
  '10th',
  '11th',
  '12th',
  'HS-grad',
  'Some-college',
  'Assoc-voc',
  'Assoc-acdm',
  'Bachelors',
  'Masters',
  'Prof-school',
  'Doctorate']}