From 3e1ec3d0474a321385710e3ab6712228c089d907 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Ko=C5=BCdo=C5=84?= <102428159+Kacper-W-Kozdon@users.noreply.github.com> Date: Thu, 9 May 2024 00:04:45 +0200 Subject: [PATCH] feat: detailed leaderboards (#19) * Update .gitignore * .gitignore * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * minor leaderboard changes * main test merge * changes to the online/offline switch handling * changes to offline saves * merge upd * offline save changes * offline save changes * offline save changes * offline save changes * fixed errors with undefined session states --- chatbot_arena.py | 130 ++++++--------------------------- helpers.py | 156 +++++++++++++++++++++++++++++++++++----- pages/1_leaderboards.py | 121 +++++++++++++++++++++++-------- 3 files changed, 255 insertions(+), 152 deletions(-) diff --git a/chatbot_arena.py b/chatbot_arena.py index c8ee3aa..559b8e4 100644 --- a/chatbot_arena.py +++ b/chatbot_arena.py @@ -15,100 +15,6 @@ ) -def init_session(mode: str = "keys") -> None: - """Initialize session states and databases. - - Parameters - ---------- - mode - defines elements to assign. - - ''keys'': initializes keys for st.session_state elements. - - ''online'': connects to the database (Google Sheets) and assigns its contents to - their corresponding session states. - - ''offline': loads the database from the project's folder and assigns its contents - to their corresponding session states. - Default: "keys". - - Returns - ------- - None - """ - - global all_models, data, json_data - if mode == "keys": - keys = [ - "chat_input", - "winner_selected", - "api_key_provided", - "vote1", - "vote2", - "model1", - "model2", - "scores", - "authenticated", - "new_models_selected", - "detailed_leaderboards", - "detail", - "new_source", - ] - for key in keys: - if key not in st.session_state.keys(): - st.session_state[key] = None - if "code_input" not in st.session_state.keys(): - st.session_state.code_input = " " - if "chat_history1" not in st.session_state.keys(): - st.session_state.chat_history1 = [] - if "chat_history2" not in st.session_state.keys(): - st.session_state.chat_history2 = [] - - if "model1_selectbox" not in st.session_state.keys(): - st.session_state.placeholder_model1 = "other" - if "model1_other" not in st.session_state.keys(): - st.session_state.placeholder_model1_other = "model@provider" - if "model2_selectbox" not in st.session_state.keys(): - st.session_state.placeholder_model2 = "other" - if "model2_other" not in st.session_state.keys(): - st.session_state.placeholder_model2_other = "model@provider" - - if "index_model1" not in st.session_state.keys(): - st.session_state.index_model1 = 0 - if "index_model2" not in st.session_state.keys(): - st.session_state.index_model2 = 0 - if "value_model1_other" not in st.session_state.keys(): - st.session_state.value_model1_other = "" - if "value_model2_other" not in st.session_state.keys(): - st.session_state.value_model2_other = "" - - if "api_key" not in st.session_state.keys(): - st.session_state.api_key = "" - - if "source" not in st.session_state.keys(): - st.session_state.source = False - - if "vote_counts" not in st.session_state: - st.session_state["vote_counts"] = { - model: {"Wins ⭐": 0, "Losses ❌": 0} for model in ["others"] - } - - if mode == "offline": - helpers.database.get_offline() - # Load JSON data from file - all_models = st.session_state.models - # model_options = [model.split("@")[0] for model in all_models] - data = pd.read_csv( - "leaderboard.csv" - ) # This will raise an error if the file does not exist - json_data = st.session_state.leaderboard - st.session_state["vote_counts"] = json_data - - if mode == "online": - helpers.database.get_online() - all_models = list(st.session_state.models) - json_data = st.session_state.leaderboard - data = {model: 0 for model in json_data.index} - st.session_state["vote_counts"] = json_data - - def select_model(api_key: str = "", authenticated: bool = False) -> None: """Select two models for the Unify API. The models are picked through selectbox options. @@ -128,7 +34,7 @@ def select_model(api_key: str = "", authenticated: bool = False) -> None: None """ - global json_data, all_models + global json_data, all_models, data disabled = not (bool(api_key) and bool(authenticated)) model1_other_disabled = True @@ -344,10 +250,26 @@ async def main() -> None: None """ - global all_models, data + global all_models, data, json_data st.set_option("deprecation.showPyplotGlobalUse", False) - init_session("keys") + helpers.init_session("keys") + if st.session_state.source is True: + source = "online" + else: + source = "offline" + helpers.init_session(source) + + all_models = list(st.session_state.models) + json_data = st.session_state.leaderboard + + if st.session_state.source is True: + data = {model: 0 for model in json_data.index} + else: + data = pd.read_csv( + "leaderboard.csv" + ) # This will raise an error if the file does not exist + st.session_state.code_input = "" st.markdown( """ @@ -365,14 +287,7 @@ async def main() -> None: type="password", ) input_api_key(api_key) - st.session_state.source = st.sidebar.checkbox( - "Use online database (google sheets).", - value=st.session_state.source, - on_change=lambda: setattr(st.session_state, "new_source", True), - ) - source = "online" if st.session_state.source is True else "offline" - if st.session_state.new_source in [True, None]: - init_session(source) + # Display sidebar widgets with st.sidebar: select_model(st.session_state.api_key, st.session_state.authenticated) @@ -478,8 +393,9 @@ async def main() -> None: : st.session_state["model2"].find("@") ] ) not in data.keys(): - st.session_state["vote_counts"][f"{model2_to_add}"]["Wins ⭐"] = 0 - st.session_state["vote_counts"][f"{model2_to_add}"]["Losses ❌"] = 0 + st.session_state["vote_counts"].at[f"{model2_to_add}", "Wins ⭐"] = 0 + st.session_state["vote_counts"].at[f"{model2_to_add}", "Losses ❌"] = 0 + st.session_state["vote_counts"].at[ f"{model2_to_add}", "Model Name" ] = f"{model2_to_add}" diff --git a/helpers.py b/helpers.py index d4feeb3..84878a9 100644 --- a/helpers.py +++ b/helpers.py @@ -12,7 +12,7 @@ class database: @staticmethod - def get_offline() -> None: + def get_offline(update: bool = False) -> None: """Static method. Assigns the local database's contents to the corresponding session states. @@ -56,10 +56,9 @@ def get_offline() -> None: "leaderboard.csv" ) # This will raise an error if the file does not exist json_data = { - model: {"Wins ⭐": wins, "Losses ❌": losses} - for model, wins, losses in zip( - data["Model Name"], data["Wins ⭐"], data["Losses ❌"] - ) + "Model Name": [model for model in data["Model Name"]], + "Wins ⭐": [wins for wins in data["Wins ⭐"]], + "Losses ❌": [losses for losses in data["Losses ❌"]], } if not os.path.exists("./detail_leaderboards.json"): @@ -99,12 +98,29 @@ def get_offline() -> None: json.dump(detail_leaderboards, out_file) with open("detail_leaderboards.csv", "r") as in_file: - st.session_state.detailed_leaderboards = { + st.session_state.offline_detailed = { "scores": pd.read_csv(in_file, index_col=0) } - st.session_state.leaderboard = json_data - st.session_state.models = all_models + st.session_state.offline_leaderboard = pd.DataFrame(json_data) + st.session_state.offline_models = all_models + + if not update: + st.session_state.leaderboard = pd.DataFrame(json_data) + st.session_state.detailed_leaderboard = st.session_state.offline_detailed + st.session_state.models = st.session_state.offline_models + + st.session_state.leaderboard[["Wins ⭐", "Losses ❌"]] = ( + st.session_state.leaderboard[["Wins ⭐", "Losses ❌"]].where( + st.session_state.leaderboard[["Wins ⭐", "Losses ❌"]] == 0, 0 + ) + ) + + st.session_state.detailed_leaderboard["scores"] = ( + st.session_state.detailed_leaderboard["scores"].where( + st.session_state.detailed_leaderboard["scores"] == 0, 0 + ) + ) @staticmethod def get_online(update: bool = False): @@ -199,21 +215,28 @@ def save_offline(): if key not in st.session_state.keys(): st.session_state[key] = None - sorted_counts = sorted( - st.session_state["vote_counts"].items(), - key=lambda x: x[1]["Wins ⭐"] + x[1]["Losses ❌"], - reverse=True, + database.get_offline(True) + + vote_counts_df = pd.DataFrame(st.session_state.vote_counts) + vote_counts_df[["Wins ⭐", "Losses ❌"]] = vote_counts_df[ + ["Wins ⭐", "Losses ❌"] + ].add( + st.session_state.offline_leaderboard[["Wins ⭐", "Losses ❌"]], fill_value=0 ) - for idx, votes in enumerate(sorted_counts): - sorted_counts[idx] = (votes[0], votes[1]["Wins ⭐"], votes[1]["Losses ❌"]) - sorted_counts_df = pd.DataFrame( - sorted_counts, columns=["Model Name", "Wins ⭐", "Losses ❌"] + sorted_counts_df = vote_counts_df[["Model Name", "Wins ⭐", "Losses ❌"]] + sorted_counts_df.sort_values(by=["Wins ⭐", "Losses ❌"], inplace=True) + + detail_leaderboards = st.session_state.detailed_leaderboards["scores"].add( + st.session_state.offline_detailed["scores"] ) + models = st.session_state.models + detail_leaderboards = st.session_state.detailed_leaderboards["scores"] detail_leaderboards.to_csv("detail_leaderboards.csv", index=False) sorted_counts_df.to_csv("leaderboard.csv", index=False) + models.to_csv("models.csv", index=False) @staticmethod def save_online(): @@ -272,4 +295,103 @@ def save_online(): st.cache_data.clear() st.experimental_rerun() - # Display our Spreadsheet as st.dataframe + +def init_session(mode: str = "keys") -> None: + """Initialize session states and databases. + + Parameters + ---------- + mode + defines elements to assign. + - ''keys'': initializes keys for st.session_state elements. + - ''online'': connects to the database (Google Sheets) and assigns its contents to + their corresponding session states. + - ''offline': loads the database from the project's folder and assigns its contents + to their corresponding session states. + Default: "keys". + + Returns + ------- + None + """ + + global all_models, data, json_data + if mode == "keys": + keys = [ + "chat_input", + "winner_selected", + "api_key_provided", + "vote1", + "vote2", + "model1", + "model2", + "scores", + "authenticated", + "new_models_selected", + "detailed_leaderboards", + "detail", + "new_source", + ] + for key in keys: + if key not in st.session_state.keys(): + st.session_state[key] = None + if "code_input" not in st.session_state.keys(): + st.session_state.code_input = " " + if "chat_history1" not in st.session_state.keys(): + st.session_state.chat_history1 = [] + if "chat_history2" not in st.session_state.keys(): + st.session_state.chat_history2 = [] + + if "model1_selectbox" not in st.session_state.keys(): + st.session_state.placeholder_model1 = "other" + if "model1_other" not in st.session_state.keys(): + st.session_state.placeholder_model1_other = "model@provider" + if "model2_selectbox" not in st.session_state.keys(): + st.session_state.placeholder_model2 = "other" + if "model2_other" not in st.session_state.keys(): + st.session_state.placeholder_model2_other = "model@provider" + + if "index_model1" not in st.session_state.keys(): + st.session_state.index_model1 = 0 + if "index_model2" not in st.session_state.keys(): + st.session_state.index_model2 = 0 + if "value_model1_other" not in st.session_state.keys(): + st.session_state.value_model1_other = "" + if "value_model2_other" not in st.session_state.keys(): + st.session_state.value_model2_other = "" + + if "api_key" not in st.session_state.keys(): + st.session_state.api_key = "" + + if "source" not in st.session_state.keys(): + st.session_state.source = False + + if "vote_counts" not in st.session_state: + st.session_state["vote_counts"] = { + model: {"Wins ⭐": 0, "Losses ❌": 0} for model in ["others"] + } + + if "enable_detail" not in st.session_state.keys(): + st.session_state.enable_detail = False + + if mode == "offline": + database.get_offline(st.session_state.new_source is True) + # Load JSON data from file + all_models = st.session_state.models + # model_options = [model.split("@")[0] for model in all_models] + data = pd.read_csv( + "leaderboard.csv" + ) # This will raise an error if the file does not exist + json_data = st.session_state.leaderboard + + st.session_state["vote_counts"] = pd.DataFrame( + json_data, columns=["Model Name", "Wins ⭐", "Losses ❌"] + ) + st.session_state["vote_counts"].set_index("Model Name", inplace=True) + + if mode == "online": + database.get_online(st.session_state.new_source is True) + all_models = list(st.session_state.models) + json_data = st.session_state.leaderboard + data = {model: 0 for model in json_data.index} + st.session_state["vote_counts"] = json_data diff --git a/pages/1_leaderboards.py b/pages/1_leaderboards.py index 8e8a7bf..2eaec13 100644 --- a/pages/1_leaderboards.py +++ b/pages/1_leaderboards.py @@ -21,20 +21,25 @@ ) # Create a DataFrame with the sorted vote counts if source == "offline": - sorted_counts = sorted( - st.session_state["vote_counts"].items(), - key=lambda x: x[1]["Wins ⭐"] + x[1]["Losses ❌"], - reverse=True, + vote_counts_df = pd.DataFrame(st.session_state.vote_counts) + vote_counts_df["Model Name"] = vote_counts_df.index + vote_counts_df[["Wins ⭐", "Losses ❌"]] = vote_counts_df[ + ["Wins ⭐", "Losses ❌"] + ].add(st.session_state.offline_leaderboard[["Wins ⭐", "Losses ❌"]], fill_value=0) + sorted_counts = vote_counts_df[["Model Name", "Wins ⭐", "Losses ❌"]] + sorted_counts.sort_values(by=["Wins ⭐", "Losses ❌"], inplace=True) + sorted_counts.index = range(sorted_counts.shape[0]) + + detail_leaderboards = st.session_state.detailed_leaderboard["scores"].add( + st.session_state.offline_detailed["scores"], fill_value=0 ) - for idx, votes in enumerate(sorted_counts): - sorted_counts[idx] = (votes[0], votes[1]["Wins ⭐"], votes[1]["Losses ❌"]) - detail_leaderboards = st.session_state.detailed_leaderboards - model_selection = list(detail_leaderboards["scores"].keys())[1:] + model_selection = list(detail_leaderboards.keys()) + detail_leaderboards = {"scores": detail_leaderboards} if source == "online": helpers.database.get_online(True) - detail_leaderboards = st.session_state.detailed_leaderboards["scores"].add( + detail_leaderboards = st.session_state.detailed_leaderboard["scores"].add( st.session_state.online_detailed["scores"], fill_value=0 ) @@ -54,30 +59,90 @@ ) sorted_counts_df.style.hide() -st.data_editor( - sorted_counts_df, num_rows="dynamic", use_container_width=True, hide_index=True -) +with st.sidebar: + enable_detail = st.checkbox( + "Enable detailed view", + value=st.session_state.enable_detail, + on_change=lambda: setattr( + st.session_state, "enable_detail", not st.session_state.enable_detail + ), + ) +sorted_counts_detail = sorted_counts_df.assign(Compare=False) +sorted_counts_detail = sorted_counts_detail[ + ["Compare", "Model Name", "Wins ⭐", "Losses ❌"] +] -c1, c2 = st.columns(2) -with c1: - model1_detail = st.selectbox("Select model 1", model_selection) -with c2: - model2_detail = st.selectbox("Select model 2", model_selection) -with st.container(border=True): - st.markdown( - f"

{model1_detail} : {model2_detail}

", - unsafe_allow_html=True, - ) - st.markdown( - f"

{int(detail_leaderboards['scores'].at[model1_detail, model2_detail])}:{int(detail_leaderboards['scores'].at[model2_detail, model1_detail])}

", - unsafe_allow_html=True, +detail_leaderboards = st.session_state.detailed_leaderboard +model_selection = list(detail_leaderboards["scores"].keys())[1:] + +if st.session_state.enable_detail: + select_for_comparison = st.data_editor( + sorted_counts_detail, num_rows="dynamic", use_container_width=True ) + models_to_compare = select_for_comparison.loc[select_for_comparison["Compare"]] + + model_names = models_to_compare["Model Name"] + + view_detail = detail_leaderboards["scores"].loc[model_names, model_names] + with st.container(border=True): + + if not model_names.empty: + st.markdown( + "

Detailed leaderboards:

", + unsafe_allow_html=True, + ) + st.markdown( + "
The values represent the number of wins of the row model against the column model
", + unsafe_allow_html=True, + ) + st.markdown( + "
(row, column) -> #row_wins
", + unsafe_allow_html=True, + ) + st.write(view_detail) + else: + st.markdown( + "

Select the models to compare.

", + unsafe_allow_html=True, + ) +else: + st.data_editor(sorted_counts_df, num_rows="dynamic", use_container_width=True) + + c1, c2 = st.columns(2) + with c1: + model1_detail = st.selectbox("Select model 1", model_selection) + with c2: + model2_detail = st.selectbox("Select model 2", model_selection) + with st.container(border=True): + st.markdown( + f"

{model1_detail} : {model2_detail}

", + unsafe_allow_html=True, + ) + st.markdown( + f"

{int(detail_leaderboards['scores'].at[model1_detail, model2_detail])}:{int(detail_leaderboards['scores'].at[model2_detail, model1_detail])}

", + unsafe_allow_html=True, + ) +enable_global = st.sidebar.checkbox( + "Enable global leaderboards", + value=st.session_state.source, + on_change=lambda: ( + setattr(st.session_state, "new_source", True), + setattr(st.session_state, "source", not st.session_state.source), + ), +) +source = "online" if st.session_state.source is True else "offline" +if st.session_state.new_source in [True, None]: + helpers.init_session(source) + st.session_state.new_source = False with st.sidebar: st.button("Save leaderboards", key="save") if st.session_state.save: - if source == "offline": - helpers.database.save_offline() - if source == "online": + + helpers.database.save_offline() + try: helpers.database.save_online() + except Exception as e: + st.write("Could not upload the results.") + st.write(e)