-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_analysis.py
73 lines (54 loc) · 2.49 KB
/
data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# import seaborn as sns
# import matplotlib.pyplot as plt
import pandas as pd
import os
import json
def find_stats(pre_train_df, fine_tune_df, n=5):
    """Summarise the largest and smallest per-column changes between two frames.

    For every column of ``pre_train_df`` other than ``"group"`` the signed
    difference ``pre_train_df - fine_tune_df`` is computed (pandas aligns the
    two frames on index and column labels), and the ``n`` largest and ``n``
    smallest differences are reported together with the ``"group"`` label of
    the rows they occur in.

    Args:
        pre_train_df: DataFrame with a ``"group"`` column plus score columns
            (presumably one per emotion -- confirm against the caller).
        fine_tune_df: DataFrame with the same score columns.
        n: Number of extreme rows to report in each direction.

    Returns:
        A dict mapping each score column name to a dict with the keys
        ``"max_change"`` / ``"min_change"`` (lists of group labels) and
        ``"max_change_values"`` / ``"min_change_values"`` (the matching
        differences).

    Raises:
        ValueError: If ``pre_train_df`` has no ``"group"`` column.
    """
    score_columns = list(pre_train_df)
    score_columns.remove("group")

    # Subtract only the score columns; "group" is a label, not a number.
    pre_scores = pre_train_df.loc[:, pre_train_df.columns != 'group']
    fine_scores = fine_tune_df.loc[:, fine_tune_df.columns != 'group']
    delta = pre_scores - fine_scores
    # Re-attach the labels so extreme rows can be mapped back to their group.
    delta["group"] = pre_train_df["group"]

    def _extremes(column):
        # Signed differences, not magnitudes: "min" means most negative.
        top = delta[column].nlargest(n)
        bottom = delta[column].nsmallest(n)
        return {
            "max_change": list(delta["group"].loc[top.index]),
            "max_change_values": list(top),
            "min_change": list(delta["group"].loc[bottom.index]),
            "min_change_values": list(bottom),
        }

    return {column: _extremes(column) for column in score_columns}
def get_second_elements(root_folder):
    """Collect the second element of every JSON list found under a folder tree.

    Walks ``root_folder`` recursively. For each directory, every ``*.json``
    file whose top-level value is a list with at least two items contributes
    its second item (``data[1]``). Directories that contribute nothing are
    omitted.

    Directories and files are visited in sorted name order so that the row
    and column order of the result is reproducible across platforms.

    Args:
        root_folder: Path of the directory to search.

    Returns:
        A DataFrame with one row per contributing directory, indexed by the
        directory's base name, with the collected values in columns
        ``0..k-1``. Rows with fewer values than the widest row are padded
        with missing values. An empty DataFrame is returned when nothing
        matches.

    Note:
        Rows are keyed by base name only, so two directories with the same
        name at different depths collide and the one visited later wins.

    Raises:
        json.JSONDecodeError: If a ``*.json`` file does not contain valid JSON.
    """
    output_dict = {}
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # Sorting in place steers os.walk's (top-down) traversal order.
        dirnames.sort()

        dir_list = []
        for filename in sorted(filenames):
            # Require the dot: a bare 'json' suffix would also match
            # extension-less names such as 'notjson'.
            if not filename.endswith('.json'):
                continue
            file_path = os.path.join(dirpath, filename)
            # JSON text is UTF-8; do not depend on the platform default.
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            if isinstance(data, list) and len(data) >= 2:
                dir_list.append(data[1])

        if dir_list:
            output_dict[os.path.basename(dirpath)] = dir_list

    return pd.DataFrame.from_dict(output_dict, orient='index')
def row_means(dataframe):
    """Return the mean of each row of ``dataframe`` as a plain Python list.

    Missing values (NaN) are skipped when averaging a row.
    """
    return dataframe.mean(axis="columns", skipna=True).tolist()
if __name__ == "__main__":
    # Build the path from components instead of a hand-written Windows
    # literal: the old string contained the invalid escape sequence "\s"
    # and only resolved on Windows.
    results_dir = os.path.join(
        'out', 'finetuned_stereoset_english', 'spearman_correlations_RSA'
    )
    print(get_second_elements(results_dir))
    # Alternative run: per-folder means for the Spanish fine-tune results.
    # print(row_means(get_second_elements(
    #     os.path.join('out', 'spanish_finetune', 'spearman_correlations_RSA'))))