Skip to content

Commit e3aa09a

Browse files
authored
Merge pull request #1841 from Giskard-AI/GSK-2604
GSK-2604 GSK-2766 Renamed title of wage classification notebook
2 parents 8133869 + 6f20371 commit e3aa09a

2 files changed

Lines changed: 13 additions & 12 deletions

File tree

docs/reference/notebooks/wage_classification.ipynb

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"collapsed": false
77
},
88
"source": [
9-
"# German credit scoring [scikit-learn]\n",
9+
"# Wage classification [scikit-learn]\n",
1010
"\n",
1111
"Giskard is an open-source framework for testing all ML models, from LLMs to tabular models. Don’t hesitate to give the project a [star on GitHub](https://github.com/Giskard-AI/giskard) ⭐️ if you find it useful!\n",
1212
"\n",
@@ -75,11 +75,11 @@
7575
"from urllib.request import urlretrieve\n",
7676
"\n",
7777
"import pandas as pd\n",
78-
"from sklearn.pipeline import Pipeline\n",
79-
"from sklearn.metrics import accuracy_score\n",
8078
"from sklearn.compose import ColumnTransformer\n",
8179
"from sklearn.ensemble import RandomForestClassifier\n",
80+
"from sklearn.metrics import accuracy_score\n",
8281
"from sklearn.model_selection import train_test_split\n",
82+
"from sklearn.pipeline import Pipeline\n",
8383
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
8484
"\n",
8585
"from giskard import Model, Dataset, scan, testing, GiskardClient, Suite"
@@ -111,8 +111,8 @@
111111
"TEST_RATIO = 0.2\n",
112112
"\n",
113113
"DROP_FEATURES = [\n",
114-
" 'education', \n",
115-
" 'native-country', \n",
114+
" 'education',\n",
115+
" 'native-country',\n",
116116
" 'occupation',\n",
117117
" 'marital-status',\n",
118118
" 'educational-num'\n",
@@ -229,7 +229,7 @@
229229
},
230230
"outputs": [],
231231
"source": [
232-
"X_train, X_test, y_train, y_test = train_test_split(income_df.drop(columns=TARGET_COLUMN), income_df[TARGET_COLUMN], \n",
232+
"X_train, X_test, y_train, y_test = train_test_split(income_df.drop(columns=TARGET_COLUMN), income_df[TARGET_COLUMN],\n",
233233
" test_size=TEST_RATIO, random_state=RANDOM_SEED)"
234234
]
235235
},
@@ -257,10 +257,12 @@
257257
"source": [
258258
"raw_data = pd.concat([X_test, y_test], axis=1)\n",
259259
"giskard_dataset = Dataset(\n",
260-
" df=raw_data, # A pandas.DataFrame that contains the raw data (before all the pre-processing steps) and the actual ground truth variable (target).\n",
260+
" df=raw_data,\n",
261+
" # A pandas.DataFrame that contains the raw data (before all the pre-processing steps) and the actual ground truth variable (target).\n",
261262
" target=TARGET_COLUMN, # Ground truth variable.\n",
262263
" name=\"salary_data\", # Optional.\n",
263-
" cat_columns=CATEGORICAL_FEATURES # List of categorical columns. Optional, but is a MUST if available. Inferred automatically if not.\n",
264+
" cat_columns=CATEGORICAL_FEATURES\n",
265+
" # List of categorical columns. Optional, but is a MUST if available. Inferred automatically if not.\n",
264266
")"
265267
]
266268
},
@@ -351,7 +353,8 @@
351353
"outputs": [],
352354
"source": [
353355
"giskard_model = Model(\n",
354-
" model=pipeline, # A prediction function that encapsulates all the data pre-processing steps and that could be executed with the dataset used by the scan.\n",
356+
" model=pipeline,\n",
357+
" # A prediction function that encapsulates all the data pre-processing steps and that could be executed with the dataset used by the scan.\n",
355358
" model_type=\"classification\", # Either regression, classification or text_generation.\n",
356359
" name=\"salary_cls\", # Optional.\n",
357360
" classification_labels=pipeline.classes_, # Their order MUST be identical to the prediction_function's output order.\n",
@@ -618,7 +621,7 @@
618621
"outputs": [],
619622
"source": [
620623
"# Create a Giskard client after having installed the Giskard server (see documentation)\n",
621-
"api_key = \"<Giskard API key>\" #This can be found in the Settings tab of the Giskard hub\n",
624+
"api_key = \"<Giskard API key>\" #This can be found in the Settings tab of the Giskard hub\n",
622625
"#hf_token = \"<Your Giskard Space token>\" #If the Giskard Hub is installed on HF Space, this can be found on the Settings tab of the Giskard Hub\n",
623626
"\n",
624627
"client = GiskardClient(\n",

giskard/datasets/base/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,8 +143,6 @@ class Dataset(ColumnMetadataMixin):
143143
column_types (Optional[Dict[str, str]]):
144144
A dictionary of column names and their types (numeric, category or text) for all columns of df. If not provided,
145145
the categorical columns will be automatically inferred.
146-
data_processor (DataProcessor):
147-
An instance of the `DataProcessor` class used for data processing.
148146
"""
149147

150148
name: Optional[str]

0 commit comments

Comments
 (0)