@@ -83,20 +83,19 @@ def _calculate_drift_psi(actual_series, reference_series, max_categories):
8383 expected_distribution = expected_frequencies / len (reference_series )
8484 actual_distribution = actual_frequencies / len (actual_series )
8585 total_psi = 0
86- output_data = pd . DataFrame ( columns = [ "Modality" , "Reference_distribution" , "Actual_distribution" , "Psi" ])
87- for category in range ( len ( all_modalities ) ):
86+ output_data = []
87+ for category , modality in enumerate ( all_modalities ):
8888 modality_psi = _calculate_psi (category , actual_distribution , expected_distribution )
89-
9089 total_psi += modality_psi
91- row = {
92- "Modality" : all_modalities [ category ],
93- "Reference_distribution " : expected_distribution [ category ] ,
94- "Actual_distribution " : expected_distribution [category ],
95- "Psi " : modality_psi ,
96- }
97-
98- output_data = output_data . append ( pd . Series ( row ), ignore_index = True )
99- return total_psi , output_data
90+ output_data . append (
91+ {
92+ "Modality " : modality ,
93+ "Reference_distribution " : expected_distribution [category ],
94+ "Actual_distribution " : expected_distribution [ category ] ,
95+ "Psi" : modality_psi ,
96+ }
97+ )
98+ return total_psi , pd . DataFrame ( output_data )
10099
101100
102101def _calculate_ks (actual_series , reference_series ) -> Ks_2sampResult :
@@ -131,28 +130,29 @@ def _calculate_chi_square(actual_series, reference_series, max_categories):
131130 # so that reference and actual has the same size
132131 # See https://github.com/scipy/scipy/blob/v1.8.0/scipy/stats/_stats_py.py#L6787
133132 k_norm = actual_series .shape [0 ] / reference_series .shape [0 ]
134- output_data = pd . DataFrame ( columns = [ "Modality" , "Reference_frequencies" , "Actual_frequencies" , "Chi_square" ])
135- for i in range ( len ( all_modalities ) ):
133+ output_data = []
134+ for i , modality in enumerate ( all_modalities ):
136135 chi_square_value = (actual_frequencies [i ] - expected_frequencies [i ] * k_norm ) ** 2 / (
137136 expected_frequencies [i ] * k_norm
138137 )
139138 chi_square += chi_square_value
140139
141- row = {
142- "Modality" : all_modalities [i ],
143- "Reference_frequencies" : expected_frequencies [i ],
144- "Actual_frequencies" : actual_frequencies [i ],
145- "Chi_square" : chi_square_value ,
146- }
140+ output_data .append (
141+ {
142+ "Modality" : modality ,
143+ "Reference_frequencies" : expected_frequencies [i ],
144+ "Actual_frequencies" : actual_frequencies [i ],
145+ "Chi_square" : chi_square_value ,
146+ }
147+ )
147148
148- output_data = output_data .append (pd .Series (row ), ignore_index = True )
149149 # if reference_series and actual_series has only one modality it turns nan (len(all_modalities)=1)
150150 if len (all_modalities ) > 1 :
151151 chi_cdf = chi2 .cdf (chi_square , len (all_modalities ) - 1 )
152152 p_value = 1 - chi_cdf if chi_cdf != 0 else 0
153153 else :
154154 p_value = 0
155- return chi_square , p_value , output_data
155+ return chi_square , p_value , pd . DataFrame ( output_data )
156156
157157
158158def _validate_feature_type (gsk_dataset , column_name , feature_type ):
0 commit comments