Skip to content

Commit 2208b5e

Browse files
authored
Merge branch 'main' into GSK-1567-add-number-to-word-transformation
2 parents 0cb8fec + ad84608 commit 2208b5e

3 files changed

Lines changed: 21 additions & 26 deletions

File tree

.github/workflows/build-python.yml

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,11 @@ jobs:
6060
langchain_minimal: [false]
6161
# https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources
6262
include:
63-
- python-version: "3.10"
64-
os: windows-2019
65-
pydantic_v1: false
66-
pandas_v1: false
67-
langchain_minimal: false
63+
# - python-version: "3.10" # Deactivated windows-2019 because the runner tries to use Python 3.7 to install PDM. TODO: try to reactivate it later.
64+
# os: windows-2019
65+
# pydantic_v1: false
66+
# pandas_v1: false
67+
# langchain_minimal: false
6868
- python-version: "3.10"
6969
os: windows-2022
7070
pydantic_v1: false
@@ -250,6 +250,7 @@ jobs:
250250
uses: pdm-project/setup-pdm@v3
251251
with:
252252
python-version: '3.10'
253+
version: '2.10.4' # Temporary pin to repair the CI; switch back to the latest version once the issue is fixed in PDM
253254
cache: false
254255
- name: Build wheel
255256
run: pdm build

giskard/scanner/robustness/text_transformations.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import itertools
22
import json
3-
import random
43
import re
54
from pathlib import Path
65

@@ -71,7 +70,7 @@ def execute(self, data: pd.DataFrame) -> pd.DataFrame:
7170
class TextTypoTransformation(TextTransformation):
7271
name = "Add typos"
7372

74-
def __init__(self, column, rate=0.05, min_length=10, rng_seed=None):
73+
def __init__(self, column, rate=0.05, min_length=10, rng_seed=1729):
7574
super().__init__(column)
7675
from .entity_swap import typos
7776

@@ -151,10 +150,11 @@ def make_perturbation(self, text):
151150
class TextLanguageBasedTransformation(TextTransformation):
152151
needs_dataset = True
153152

154-
def __init__(self, column):
153+
def __init__(self, column, rng_seed=1729):
155154
super().__init__(column)
156155
self._lang_dictionary = dict()
157156
self._load_dictionaries()
157+
self.rng = np.random.default_rng(seed=rng_seed)
158158

159159
def _load_dictionaries(self):
160160
raise NotImplementedError()
@@ -253,7 +253,7 @@ def make_perturbation(self, row):
253253
mask_value = f"__GSK__ENT__RELIGION__{n_list}__{n_term}__"
254254
text, num_rep = re.subn(rf"\b{re.escape(term)}(s?)\b", rf"{mask_value}\1", text, flags=re.IGNORECASE)
255255
if num_rep > 0:
256-
i = (n_term + 1 + random.randrange(len(term_list) - 1)) % len(term_list)
256+
i = (n_term + 1 + self.rng.choice(len(term_list) - 1)) % len(term_list)
257257
replacement = term_list[i]
258258
replacements.append((mask_value, replacement))
259259

@@ -295,7 +295,7 @@ def make_perturbation(self, row):
295295
)
296296
if num_rep > 0:
297297
r_income_type = "low-income" if income_type == "high-income" else "high-income"
298-
replacement = random.choice(nationalities_word_dict[entity_type][r_income_type])
298+
replacement = self.rng.choice(nationalities_word_dict[entity_type][r_income_type])
299299
replacements.append((mask_value, replacement))
300300

301301
# Replace masks

tests/scan/test_text_transformations.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import random
21
import re
32

43
import pandas as pd
@@ -202,9 +201,8 @@ def test_religion_based_transformation():
202201
)
203202
from giskard.scanner.robustness.text_transformations import TextReligionTransformation
204203

205-
t = TextReligionTransformation(column="text")
204+
t = TextReligionTransformation(column="text", rng_seed=10)
206205

207-
random.seed(0)
208206
transformed = dataset.transform(t)
209207
transformed_text = transformed.df.text.values
210208

@@ -213,12 +211,12 @@ def test_religion_based_transformation():
213211
"mois de ramadan."
214212
)
215213
assert (
216-
transformed_text[1] == "Une partie des chrétiens commémorent ce vendredi 5 mai la naissance, l’éveil et la "
217-
"mort de muhammad, dit « le Bouddha »"
214+
transformed_text[1] == "Une partie des hindous commémorent ce vendredi 5 mai la naissance, l’éveil et la "
215+
"mort de abraham, dit « le Bouddha »"
218216
)
219217
assert (
220218
transformed_text[2] == "Signs have also been placed in the direction of kumbh mela along one of the Peak "
221-
"District’s most popular hiking routes, Cave Dale, to help christians combine prayer "
219+
"District’s most popular hiking routes, Cave Dale, to help jews combine prayer "
222220
"with enjoying the outdoors."
223221
)
224222
assert (
@@ -228,9 +226,6 @@ def test_religion_based_transformation():
228226

229227

230228
def test_country_based_transformation():
231-
import random
232-
233-
random.seed(10)
234229
dataset = _dataset_from_dict(
235230
{
236231
"text": [
@@ -244,31 +239,30 @@ def test_country_based_transformation():
244239
)
245240
from giskard.scanner.robustness.text_transformations import TextNationalityTransformation
246241

247-
t = TextNationalityTransformation(column="text")
242+
t = TextNationalityTransformation(column="text", rng_seed=0)
248243

249244
transformed = dataset.transform(t)
250245
transformed_text = transformed.df.text.values
251246

252247
assert (
253-
transformed_text[0] == "Les musulmans de Eswatini fêtent vendredi 21 avril la fin du "
248+
transformed_text[0] == "Les musulmans de Saint Thomas et Prince fêtent vendredi 21 avril la fin du "
254249
"jeûne pratiqué durant le mois de ramadan."
255250
)
256-
assert transformed_text[1] == "Des incendies ravagent l'Congo depuis la fin août 2019."
251+
assert transformed_text[1] == "Des incendies ravagent l'Liban depuis la fin août 2019."
257252
assert (
258-
transformed_text[2] == "Bali is an Libyan island known for its forested volcanic mountains, iconic"
253+
transformed_text[2] == "Bali is an Singaporean island known for its forested volcanic mountains, iconic"
259254
" rice paddies, beaches and coral reefs. The island is home to religious sites "
260255
"such as cliffside Uluwatu Temple"
261256
)
262257
assert (
263258
transformed_text[3]
264-
== "President Joe Biden visited U.S.'s capital for the first time since Nigeria invaded the country"
259+
== "President Joe Biden visited UAE's capital for the first time since Syria invaded the country"
265260
)
266261

267262

268263
def test_country_based_transformation_edge_cases():
269264
from giskard.scanner.robustness.text_transformations import TextNationalityTransformation
270265

271-
random.seed(0)
272266
df = pd.DataFrame(
273267
{
274268
"text": [
@@ -281,7 +275,7 @@ def test_country_based_transformation_edge_cases():
281275
}
282276
)
283277

284-
t = TextNationalityTransformation(column="text")
278+
t = TextNationalityTransformation(column="text", rng_seed=0)
285279

286280
t1 = t.make_perturbation(df.iloc[0])
287281
t2 = t.make_perturbation(df.iloc[1])

0 commit comments

Comments
 (0)