From 3d79e6e31044357ff1936aa9c4e5a02638f8abdd Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 18:51:30 +0530 Subject: [PATCH 01/48] Update extension.py --- openml/extensions/sklearn/extension.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 82d202e9c..98c0a2e6d 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -2101,6 +2101,20 @@ def instantiate_model_from_hpo_class( return base_estimator def _extract_trace_data(self, model, rep_no, fold_no): + """Extracts data from a machine learning model's cross-validation results and creates an ARFF (Attribute-Relation File Format) trace. + + Parameters + ---------- + model : Any + A fitted hyperparameter optimization model. + rep_no : int + The repetition number. + fold_no : int + The fold number. + Returns + ------- + A list of ARFF tracecontent. + """ arff_tracecontent = [] for itt_no in range(0, len(model.cv_results_["mean_test_score"])): # we use the string values for True and False, as it is defined in From 2c4519ed80c816044b4b0163eefea217a34548b3 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 18:54:20 +0530 Subject: [PATCH 02/48] Update task.py --- openml/tasks/task.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 36e0ada1c..f8783c785 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -36,6 +36,8 @@ class OpenMLTask(OpenMLBase): Parameters ---------- + task_id : Union[int, None] + Refers to the unique identifier of a task. task_type_id : TaskType Refers to the type of task. task_type : str @@ -44,6 +46,14 @@ class OpenMLTask(OpenMLBase): Refers to the data. estimation_procedure_id: int Refers to the type of estimates used. + estimation_procedure_type : Optional[str] + Refers to the type of estimation procedure used for the task. + estimation_parameters : Optional[Dict[str, str]] + Estimation parameters used for the task. + evaluation_measure : Optional[str] + Refers to the evaluation measure. + data_splits_url : Optional[str] + Refers to the URL of the data splits used for the task. """ def __init__( From 7ddbdf4fc45faeb15422061e9dbadf5f4c05f7d2 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 18:56:22 +0530 Subject: [PATCH 03/48] Update flow.py --- openml/flows/flow.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/openml/flows/flow.py b/openml/flows/flow.py index b9752e77c..f1c5935e8 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -523,6 +523,18 @@ def get_subflow(self, structure): def _copy_server_fields(source_flow, target_flow): + """ Recursively copies the fields added by the server from the `source_flow` to the `target_flow`. + + Parameters + ---------- + source_flow : OpenMLFlow + To copy the fields from. + target_flow : OpenMLFlow + To copy the fields to. + Returns + ------- + None + """ fields_added_by_the_server = ["flow_id", "uploader", "version", "upload_date"] for field in fields_added_by_the_server: setattr(target_flow, field, getattr(source_flow, field)) @@ -533,5 +545,19 @@ def _copy_server_fields(source_flow, target_flow): def _add_if_nonempty(dic, key, value): + """ Adds a key-value pair to a dictionary if the value is not None. + + Parameters + ---------- + dic: dict + To add the key-value pair to. + key: hashable + To add to the dictionary. + value: Any + To add to the dictionary. + Returns + ------- + None + """ if value is not None: dic[key] = value From 7d5a04fc460aff1bc142d314dd51fa907440d3c9 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 18:59:17 +0530 Subject: [PATCH 04/48] Update functions.py --- openml/flows/functions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/openml/flows/functions.py b/openml/flows/functions.py index c4faded0a..11b26d367 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -337,6 +337,18 @@ def get_flow_id( def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: + """ + Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. + Parameters + ---------- + api_call: str + Retrieves the information about flows. + output_format: str in {"dict", "dataframe"} + The output format. + Returns + ------- + The flows information in the specified output format. + """ xml_string = openml._api_calls._perform_api_call(api_call, "get") flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",)) From 58ef19d8ef2dcfb3252a24dd5bbfda563be7c59a Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 19:07:43 +0530 Subject: [PATCH 05/48] Update functions.py --- openml/runs/functions.py | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index ee582dbb7..5aaca77d7 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -128,6 +128,16 @@ def run_model_on_task( flow = extension.model_to_flow(model) def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask: + """ Retrieve an OpenMLTask object from either an integer or string ID, or directly from an OpenMLTask object. + Parameters + ---------- + task : Union[int, str, OpenMLTask] + The task ID or the OpenMLTask object. + Returns + ------- + OpenMLTask + The OpenMLTask object. + """ if isinstance(task, (int, str)): return get_task(int(task)) else: @@ -451,6 +461,27 @@ def _run_task_get_arffcontent( "OrderedDict[str, OrderedDict]", "OrderedDict[str, OrderedDict]", ]: + """ Runs the hyperparameter optimization on the given task and returns the arfftrace content. + Parameters + ---------- + model : Any + The model that is to be evalauted. + task : OpenMLTask + The OpenMLTask to evaluate. + extension : Extension + The OpenML extension object. + add_local_measures : bool + Whether to compute additional local evaluation measures. + dataset_format : str + The format in which to download the dataset. + n_jobs : int + Number of jobs to run in parallel. If None, use 1 core by default. If -1, use all available cores. + + Returns + ------- + Tuple[List[List], Optional[OpenMLRunTrace], OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] + A tuple containing the arfftrace content, the OpenML run trace, the global and local evaluation measures. + """ arff_datacontent = [] # type: List[List] traces = [] # type: List[OpenMLRunTrace] # stores fold-based evaluation measures. In case of a sample based task, @@ -636,6 +667,35 @@ def _run_task_get_arffcontent_parallel_helper( Optional[OpenMLRunTrace], "OrderedDict[str, float]", ]: + """ Helper function that runs a single model on a single task fold sample. + + Parameters + ---------- + extension : Extension + An OpenML extension instance. + fold_no : int + The fold number to be run. + model : Any + The model that is to be evaluated. + rep_no : int + Repetition number to be run. + sample_no : int + Sample number to be run. + task : OpenMLTask + The task object from OpenML. + dataset_format : str + The dataset format to be used. + configuration : Dict + Hyperparameters to configure the model. + + Returns + ------- + Tuple[np.ndarray, Optional[pd.DataFrame], np.ndarray, Optional[pd.DataFrame], + Optional[OpenMLRunTrace], OrderedDict[str, float]] + A tuple containing the predictions, probability estimates (if applicable), + actual target values, actual target value probabilities (if applicable), + the trace object of the OpenML run (if applicable), and a dictionary of local measures for this particular fold. + """ # Sets up the OpenML instantiated in the child process to match that of the parent's # if configuration=None, loads the default config._setup(configuration) From 63a20bbbd06996bc722e9f7e9eab8999f8ab792b Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 19:12:11 +0530 Subject: [PATCH 06/48] Update trace.py --- openml/runs/trace.py | 50 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/openml/runs/trace.py b/openml/runs/trace.py index f6b038a55..85fd55334 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -33,7 +33,17 @@ class OpenMLRunTrace(object): """ - def __init__(self, run_id, trace_iterations): + def __init__(self, run_id: int, trace_iterations: List[List]): + """ + Object to hold the trace content of a run. + + Parameters + ---------- + run_id : int + Id for which the trace content is to be stored. + trace_iterations : List[List] + The trace content obtained by running a flow on a task. + """ self.run_id = run_id self.trace_iterations = trace_iterations @@ -228,6 +238,24 @@ def trace_from_arff(cls, arff_obj): @classmethod def _trace_from_arff_struct(cls, attributes, content, error_message): + """ Generate a trace dictionary from ARFF structure. + + Parameters + ---------- + cls : type + The trace object to be created. + attributes : List[Tuple[str, str]] + Attribute descriptions. + content : List[List[Union[int, float, str]]] + List of instances. + error_message : str + Error message to raise if `setup_string` is in `attributes`. + + Returns + ------- + OrderedDict + A dictionary representing the trace. + """ trace = OrderedDict() attribute_idx = {att[0]: idx for idx, att in enumerate(attributes)} @@ -345,6 +373,26 @@ def trace_from_xml(cls, xml): @classmethod def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace": + """Merge multiple traces into a single trace. + + Parameters + ---------- + cls : type + Type of the trace object to be created. + traces : List[OpenMLRunTrace] + List of traces to merge. + + Returns + ------- + OpenMLRunTrace + A trace object representing the merged traces. + + Raises + ------ + ValueError + If the parameters in the iterations of the traces being merged are not equal. + If a key (repeat, fold, iteration) is encountered twice while merging the traces. + """ merged_trace = ( OrderedDict() ) # type: OrderedDict[Tuple[int, int, int], OpenMLTraceIteration] # noqa E501 From 0197de31527429e5bff58cb0267a072d94cde97a Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 19:16:22 +0530 Subject: [PATCH 07/48] Update functions.py --- openml/setups/functions.py | 39 +++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 52969fb8c..069c3c38e 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -60,8 +60,24 @@ def setup_exists(flow) -> int: return setup_id if setup_id > 0 else False -def _get_cached_setup(setup_id): - """Load a run from the cache.""" +def _get_cached_setup(setup_id: int): + """Load a run from the cache. + + Parameters + ---------- + setup_id : int + ID of the setup to be loaded. + + Returns + ------- + OpenMLSetup + The loaded setup object. + + Raises + ------ + OpenMLCacheException + If the setup file for the given setup ID is not cached. + """ cache_dir = config.get_cache_directory() setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id)) try: @@ -271,7 +287,21 @@ def initialize_model(setup_id: int) -> Any: return model -def _to_dict(flow_id, openml_parameter_settings): +def _to_dict(flow_id: int, openml_parameter_settings): + """ Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. + + Parameters + ---------- + flow_id : int + ID of the flow. + openml_parameter_settings : List[OpenMLParameter] + A list of OpenML parameter settings. + + Returns + ------- + OrderedDict + A dictionary representation of the flow ID and parameter settings. + """ # for convenience, this function (ab)uses the run object. xml = OrderedDict() xml["oml:run"] = OrderedDict() @@ -319,6 +349,9 @@ def _create_setup_from_xml(result_dict, output_format="object"): def _create_setup_parameter_from_xml(result_dict, output_format="object"): + """ + Create an OpenMLParameter object or a dictionary from an API xml result. + """ if output_format == "object": return OpenMLParameter( input_id=int(result_dict["oml:id"]), From 1385155e76e23ae39e611953c6d7853e66bef05a Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 19:20:57 +0530 Subject: [PATCH 08/48] Update functions.py --- openml/study/functions.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/openml/study/functions.py b/openml/study/functions.py index 7b72a31eb..d84f12580 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -107,6 +107,20 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: tags.append(current_tag) def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: + """ Extracts a list of nested IDs from a result dictionary. + + Parameters + ---------- + key : str + Nested OpenML IDs. + subkey : str + The subkey contains the nested OpenML IDs. + + Returns + ------- + Optional[List] + A list of nested OpenML IDs, or None if the key is not present in the dictionary. + """ if result_dict.get(key) is not None: return [int(oml_id) for oml_id in result_dict[key][subkey]] return None @@ -591,6 +605,20 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]: + """ Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. + + Parameters + ---------- + api_call : str + The API call for retrieving the list of OpenML studies. + output_format : str in {"object", "dataframe"} + Format of the output, either 'object' for a dictionary or 'dataframe' for a Pandas DataFrame. + + Returns + ------- + Union[Dict, pd.DataFrame] + A dictionary or Pandas DataFrame of OpenML studies, depending on the value of 'output_format'. + """ xml_string = openml._api_calls._perform_api_call(api_call, "get") study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) From 43ed1522eecd2b6328907a7650d5368a8fb48948 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 19:22:57 +0530 Subject: [PATCH 09/48] Update functions.py --- openml/tasks/functions.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 00a8e822d..e0efb9c5e 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -230,6 +230,28 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs): def __list_tasks(api_call, output_format="dict"): + """ Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. + + Parameters + ---------- + api_call : str + The API call specifying which tasks to return. + output_format : str in {"dict", "dataframe"} + Output format for the returned object. + + Returns + ------- + Union[Dict, pd.DataFrame] + A dictionary or a Pandas DataFrame with information about OpenML tasks. + + Raises + ------ + ValueError + If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', or has an incorrect value for + '@xmlns:oml'. + KeyError + If an invalid key is found in the XML for a task. + """ xml_string = openml._api_calls._perform_api_call(api_call, "get") tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input")) # Minimalistic check if the XML is useful From 6c5f9d4f3ecbe825f8ff722fb635108826d87b5f Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 19:26:03 +0530 Subject: [PATCH 10/48] Update split.py --- openml/tasks/split.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/openml/tasks/split.py b/openml/tasks/split.py index e47c6040a..bea7f9390 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -136,9 +136,48 @@ def _from_arff_file(cls, filename: str) -> "OpenMLSplit": return cls(name, "", repetitions) def from_dataset(self, X, Y, folds, repeats): + """ Generates a new OpenML dataset object from input data and cross-validation settings. + + Parameters + ---------- + X : array-like or sparse matrix + The input feature matrix. + Y : array-like, shape + The target variable values. + folds : int + Number of cross-validation folds to generate. + repeats : int + Number of times to repeat the cross-validation process. + + Raises + ------ + NotImplementedError + This method is not implemented yet. + """ raise NotImplementedError() def get(self, repeat=0, fold=0, sample=0): + """ Returns the specified data split from the CrossValidationSplit object. + + Parameters + ---------- + repeat : int + Index of the repeat to retrieve. + fold : int + Index of the fold to retrieve. + sample : int + Index of the sample to retrieve. + + Returns + ------- + numpy.ndarray + The data split for the specified repeat, fold, and sample. + + Raises + ------ + ValueError + If the specified repeat, fold, or sample is not known. + """ if repeat not in self.split: raise ValueError("Repeat %s not known" % str(repeat)) if fold not in self.split[repeat]: From e4db820a1a19373f5b3e1d4dc06defbe911691b8 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 16 Aug 2023 19:58:36 +0530 Subject: [PATCH 11/48] Update task.py --- openml/tasks/task.py | 146 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 128 insertions(+), 18 deletions(-) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index f8783c785..26b275572 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -36,24 +36,24 @@ class OpenMLTask(OpenMLBase): Parameters ---------- - task_id : Union[int, None] - Refers to the unique identifier of a task. - task_type_id : TaskType - Refers to the type of task. - task_type : str - Refers to the task. + task_id: Union[int, None] + Refers to the unique identifier of OpenML task. + task_type_id: TaskType + Refers to the type of OpenML task. + task_type: str + Refers to the OpenML task. data_set_id: int Refers to the data. estimation_procedure_id: int Refers to the type of estimates used. - estimation_procedure_type : Optional[str] - Refers to the type of estimation procedure used for the task. - estimation_parameters : Optional[Dict[str, str]] - Estimation parameters used for the task. - evaluation_measure : Optional[str] + estimation_procedure_type: str, default=None + Refers to the type of estimation procedure used for the OpenML task. + estimation_parameters: [Dict[str, str]], default=None + Estimation parameters used for the OpenML task. + evaluation_measure: str, default=None Refers to the evaluation measure. - data_splits_url : Optional[str] - Refers to the URL of the data splits used for the task. + data_splits_url: str, default=None + Refers to the URL of the data splits used for the OpenML task. """ def __init__( @@ -216,8 +216,26 @@ class OpenMLSupervisedTask(OpenMLTask, ABC): Parameters ---------- + task_type_id : TaskType + ID of the task type. + task_type : str + Name of the task type. + data_set_id : int + ID of the OpenML dataset associated with the task. target_name : str Name of the target feature (the class variable). + estimation_procedure_id : int, default=None + ID of the estimation procedure for the task. + estimation_procedure_type : str, default=None + Type of the estimation procedure for the task. + estimation_parameters : dict, default=None + Estimation parameters for the task. + evaluation_measure : str, default=None + Name of the evaluation measure for the task. + data_splits_url : str, default=None + URL of the data splits for the task. + task_id: Union[int, None] + Refers to the unique identifier of task. """ def __init__( @@ -319,8 +337,30 @@ class OpenMLClassificationTask(OpenMLSupervisedTask): Parameters ---------- - class_labels : List of str (optional) - cost_matrix: array (optional) + task_type_id : TaskType + ID of the Classification task type. + task_type : str + Name of the Classification task type. + data_set_id : int + ID of the OpenML dataset associated with the Classification task. + target_name : str + Name of the target variable. + estimation_procedure_id : int, default=None + ID of the estimation procedure for the Classification task. + estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Estimation parameters for the Classification task. + evaluation_measure : str, default=None + Name of the evaluation measure. + data_splits_url : str, default=None + URL of the data splits for the Classification task. + task_id : Union[int, None] + ID of the Classification task (if it already exists on OpenML). + class_labels : List of str, default=None + A list of class labels (for classification tasks). + cost_matrix : array, default=None + A cost matrix (for classification tasks). """ def __init__( @@ -358,7 +398,31 @@ def __init__( class OpenMLRegressionTask(OpenMLSupervisedTask): - """OpenML Regression object.""" + """OpenML Regression object. + + Parameters + ---------- + task_type_id : TaskType + Task type ID of the OpenML Regression task. + task_type : str + Task type of the OpenML Regression task. + data_set_id : int + ID of the OpenML dataset. + target_name : str + Name of the target feature used in the Regression task. + estimation_procedure_id : int, default=None + ID of the OpenML estimation procedure. + estimation_procedure_type : str, default=None + Type of the OpenML estimation procedure. + estimation_parameters : dict, default=None + Parameters used by the OpenML estimation procedure. + data_splits_url : str, default=None + URL of the OpenML data splits for the Regression task. + task_id : Union[int, None] + ID of the OpenML Regression task. + evaluation_measure : str, default=None + Evaluation measure used in the Regression task. + """ def __init__( self, @@ -392,7 +456,25 @@ class OpenMLClusteringTask(OpenMLTask): Parameters ---------- - target_name : str (optional) + task_type_id : TaskType + Task type ID of the OpenML clustering task. + task_type : str + Task type of the OpenML clustering task. + data_set_id : int + ID of the OpenML dataset used in clustering the task. + estimation_procedure_id : int, default=None + ID of the OpenML estimation procedure. + task_id : Union[int, None] + ID of the OpenML clustering task. + estimation_procedure_type : str, default=None + Type of the OpenML estimation procedure used in the clustering task. + estimation_parameters : dict, default=None + Parameters used by the OpenML estimation procedure. + data_splits_url : str, default=None + URL of the OpenML data splits for the clustering task. + evaluation_measure : str, default=None + Evaluation measure used in the clustering task. + target_name : str, default=None Name of the target feature (class) that is not part of the feature set for the clustering task. """ @@ -469,7 +551,35 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": class OpenMLLearningCurveTask(OpenMLClassificationTask): - """OpenML Learning Curve object.""" + """OpenML Learning Curve object. + + Parameters + ---------- + task_type_id : TaskType + ID of the Learning Curve task. + task_type : str + Name of the Learning Curve task. + data_set_id : int + ID of the dataset that this task is associated with. + target_name : str + Name of the target feature in the dataset. + estimation_procedure_id : int, default=None + ID of the estimation procedure to use for evaluating models. + estimation_procedure_type : str, default=None + Type of the estimation procedure. + estimation_parameters : dict, default=None + Additional parameters for the estimation procedure. + data_splits_url : str, default=None + URL of the file containing the data splits for Learning Curve task. + task_id : Union[int, None] + ID of the Learning Curve task. + evaluation_measure : str, default=None + Name of the evaluation measure to use for evaluating models. + class_labels : list of str, default=None + Class labels for Learning Curve tasks. + cost_matrix : numpy array, default=None + Cost matrix for Learning Curve tasks. + """ def __init__( self, From 994854103176f392e52e18a48cc425c07dc0c21c Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:37:40 +0530 Subject: [PATCH 12/48] Update openml/flows/flow.py Co-authored-by: Lennart Purucker --- openml/flows/flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/flows/flow.py b/openml/flows/flow.py index f1c5935e8..70ecaa3a3 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -523,7 +523,7 @@ def get_subflow(self, structure): def _copy_server_fields(source_flow, target_flow): - """ Recursively copies the fields added by the server from the `source_flow` to the `target_flow`. + """Recursively copies the fields added by the server from the `source_flow` to the `target_flow`. Parameters ---------- From 1f7979344b9d2271e72044b7b2a4c139c3356264 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:40:57 +0530 Subject: [PATCH 13/48] Update openml/flows/flow.py Co-authored-by: Lennart Purucker --- openml/flows/flow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 70ecaa3a3..42e5948b7 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -545,7 +545,7 @@ def _copy_server_fields(source_flow, target_flow): def _add_if_nonempty(dic, key, value): - """ Adds a key-value pair to a dictionary if the value is not None. + """Adds a key-value pair to a dictionary if the value is not None. Parameters ---------- From bb0077c12f02a7255d3c32bc5c01fce5080895e2 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:41:09 +0530 Subject: [PATCH 14/48] Update openml/flows/functions.py Co-authored-by: Lennart Purucker --- openml/flows/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 11b26d367..3bb044f58 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -337,8 +337,8 @@ def get_flow_id( def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: - """ - Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. + """Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. + Parameters ---------- api_call: str From e986733d8f3d12cccb0f5a47fe44860c317f51f3 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:41:20 +0530 Subject: [PATCH 15/48] Update openml/extensions/sklearn/extension.py Co-authored-by: Lennart Purucker --- openml/extensions/sklearn/extension.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 98c0a2e6d..2da49eb72 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -2103,17 +2103,17 @@ def instantiate_model_from_hpo_class( def _extract_trace_data(self, model, rep_no, fold_no): """Extracts data from a machine learning model's cross-validation results and creates an ARFF (Attribute-Relation File Format) trace. - Parameters - ---------- - model : Any - A fitted hyperparameter optimization model. - rep_no : int - The repetition number. - fold_no : int - The fold number. - Returns - ------- - A list of ARFF tracecontent. + Parameters + ---------- + model : Any + A fitted hyperparameter optimization model. + rep_no : int + The repetition number. + fold_no : int + The fold number. + Returns + ------- + A list of ARFF tracecontent. """ arff_tracecontent = [] for itt_no in range(0, len(model.cv_results_["mean_test_score"])): From 426e5fbc99094a103a0b52148455d50c25f510b4 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:41:32 +0530 Subject: [PATCH 16/48] Update openml/flows/functions.py Co-authored-by: Lennart Purucker --- openml/flows/functions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 3bb044f58..a84238f36 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -346,6 +346,7 @@ def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.D output_format: str in {"dict", "dataframe"} The output format. Returns + ------- The flows information in the specified output format. """ From 581d47ae0e593b2ab57021677603ff56d631c6d8 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:41:45 +0530 Subject: [PATCH 17/48] Update openml/runs/functions.py Co-authored-by: Lennart Purucker --- openml/runs/functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 5aaca77d7..b3adcecdf 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -128,11 +128,13 @@ def run_model_on_task( flow = extension.model_to_flow(model) def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask: - """ Retrieve an OpenMLTask object from either an integer or string ID, or directly from an OpenMLTask object. + """Retrieve an OpenMLTask object from either an integer or string ID, or directly from an OpenMLTask object. + Parameters ---------- task : Union[int, str, OpenMLTask] The task ID or the OpenMLTask object. + Returns ------- OpenMLTask From 31d6e039fc638cf8d5f62346e275ff06003c016f Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:42:03 +0530 Subject: [PATCH 18/48] Update openml/runs/functions.py Co-authored-by: Lennart Purucker --- openml/runs/functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index b3adcecdf..23a01838c 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -463,7 +463,8 @@ def _run_task_get_arffcontent( "OrderedDict[str, OrderedDict]", "OrderedDict[str, OrderedDict]", ]: - """ Runs the hyperparameter optimization on the given task and returns the arfftrace content. + """Runs the hyperparameter optimization on the given task and returns the arfftrace content. + Parameters ---------- model : Any From 4dfe6fe24ee73510b86473d2dad867145eba52c7 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:42:18 +0530 Subject: [PATCH 19/48] Update openml/runs/functions.py Co-authored-by: Lennart Purucker --- openml/runs/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 23a01838c..645663662 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -670,7 +670,7 @@ def _run_task_get_arffcontent_parallel_helper( Optional[OpenMLRunTrace], "OrderedDict[str, float]", ]: - """ Helper function that runs a single model on a single task fold sample. + """Helper function that runs a single model on a single task fold sample. Parameters ---------- From 3c29b1d5ef06b136099680b7c4800857fa4d6e4f Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:42:33 +0530 Subject: [PATCH 20/48] Update openml/runs/trace.py Co-authored-by: Lennart Purucker --- openml/runs/trace.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 85fd55334..5a0992f82 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -34,8 +34,7 @@ class OpenMLRunTrace(object): """ def __init__(self, run_id: int, trace_iterations: List[List]): - """ - Object to hold the trace content of a run. + """Object to hold the trace content of a run. Parameters ---------- From bf07329882597541c517367bae81e40e44ca7af2 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:42:45 +0530 Subject: [PATCH 21/48] Update openml/runs/trace.py Co-authored-by: Lennart Purucker --- openml/runs/trace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 5a0992f82..28171900f 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -237,7 +237,7 @@ def trace_from_arff(cls, arff_obj): @classmethod def _trace_from_arff_struct(cls, attributes, content, error_message): - """ Generate a trace dictionary from ARFF structure. + """Generate a trace dictionary from ARFF structure. Parameters ---------- From 711303a4729645d6883cf2d6ed18862dae94e4cc Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:42:57 +0530 Subject: [PATCH 22/48] Update openml/setups/functions.py Co-authored-by: Lennart Purucker --- openml/setups/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 069c3c38e..76cce7014 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -288,7 +288,7 @@ def initialize_model(setup_id: int) -> Any: def _to_dict(flow_id: int, openml_parameter_settings): - """ Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. + """Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. Parameters ---------- From ad8aec23f70fa6b2b54c2cf2bb194e4fae6a35bf Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:43:09 +0530 Subject: [PATCH 23/48] Update openml/study/functions.py Co-authored-by: Lennart Purucker --- openml/study/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/study/functions.py b/openml/study/functions.py index d84f12580..461b96ba5 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -107,7 +107,7 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: tags.append(current_tag) def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: - """ Extracts a list of nested IDs from a result dictionary. + """Extracts a list of nested IDs from a result dictionary. Parameters ---------- From 2150d64e94a77dae4a30ad5938753465528419ba Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:43:20 +0530 Subject: [PATCH 24/48] Update openml/study/functions.py Co-authored-by: Lennart Purucker --- openml/study/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/study/functions.py b/openml/study/functions.py index 461b96ba5..cf9587a45 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -605,7 +605,7 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]: - """ Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. + """Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. Parameters ---------- From f9fee1eccec7355f19ad483ab4b456b9f7fbfc97 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:43:28 +0530 Subject: [PATCH 25/48] Update openml/tasks/functions.py Co-authored-by: Lennart Purucker --- openml/tasks/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index e0efb9c5e..2a327ddb0 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -230,7 +230,7 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs): def __list_tasks(api_call, output_format="dict"): - """ Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. + """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. Parameters ---------- From 3776710ac36c074e3b3940e58bc5e984b152babf Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:43:37 +0530 Subject: [PATCH 26/48] Update openml/tasks/split.py Co-authored-by: Lennart Purucker --- openml/tasks/split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/tasks/split.py b/openml/tasks/split.py index bea7f9390..655126d3a 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -136,7 +136,7 @@ def _from_arff_file(cls, filename: str) -> "OpenMLSplit": return cls(name, "", repetitions) def from_dataset(self, X, Y, folds, repeats): - """ Generates a new OpenML dataset object from input data and cross-validation settings. + """Generates a new OpenML dataset object from input data and cross-validation settings. Parameters ---------- From 091335da207be1c36ef31dd8dcb57fa1be346148 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Fri, 18 Aug 2023 16:43:43 +0530 Subject: [PATCH 27/48] Update openml/tasks/split.py Co-authored-by: Lennart Purucker --- openml/tasks/split.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 655126d3a..2df3aad72 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -157,7 +157,7 @@ def from_dataset(self, X, Y, folds, repeats): raise NotImplementedError() def get(self, repeat=0, fold=0, sample=0): - """ Returns the specified data split from the CrossValidationSplit object. + """Returns the specified data split from the CrossValidationSplit object. Parameters ---------- From 2f9b2ad80e13b4e930e75d5877fb8207817b850b Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Mon, 21 Aug 2023 10:11:47 +0530 Subject: [PATCH 28/48] Update openml/tasks/task.py From 571951f18a67c6ec58f3649bec5acf4f22b56458 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 04:42:21 +0000 Subject: [PATCH 29/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/extensions/sklearn/extension.py | 2 +- openml/flows/flow.py | 4 ++-- openml/flows/functions.py | 4 ++-- openml/runs/functions.py | 16 ++++++++-------- openml/runs/trace.py | 12 ++++++------ openml/setups/functions.py | 12 ++++++------ openml/study/functions.py | 8 ++++---- openml/tasks/functions.py | 6 +++--- openml/tasks/split.py | 10 +++++----- 9 files changed, 37 insertions(+), 37 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 2da49eb72..d27b29293 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -2102,7 +2102,7 @@ def instantiate_model_from_hpo_class( def _extract_trace_data(self, model, rep_no, fold_no): """Extracts data from a machine learning model's cross-validation results and creates an ARFF (Attribute-Relation File Format) trace. - + Parameters ---------- model : Any diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 42e5948b7..6b011679d 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -524,7 +524,7 @@ def get_subflow(self, structure): def _copy_server_fields(source_flow, target_flow): """Recursively copies the fields added by the server from the `source_flow` to the `target_flow`. - + Parameters ---------- source_flow : OpenMLFlow @@ -546,7 +546,7 @@ def _copy_server_fields(source_flow, target_flow): def _add_if_nonempty(dic, key, value): """Adds a key-value pair to a dictionary if the value is not None. - + Parameters ---------- dic: dict diff --git a/openml/flows/functions.py b/openml/flows/functions.py index a84238f36..0c3efd39c 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -338,7 +338,7 @@ def get_flow_id( def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: """Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. - + Parameters ---------- api_call: str @@ -346,7 +346,7 @@ def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.D output_format: str in {"dict", "dataframe"} The output format. Returns - + ------- The flows information in the specified output format. """ diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 645663662..0acbb1fdd 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -129,12 +129,12 @@ def run_model_on_task( def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask: """Retrieve an OpenMLTask object from either an integer or string ID, or directly from an OpenMLTask object. - + Parameters ---------- task : Union[int, str, OpenMLTask] The task ID or the OpenMLTask object. - + Returns ------- OpenMLTask @@ -464,7 +464,7 @@ def _run_task_get_arffcontent( "OrderedDict[str, OrderedDict]", ]: """Runs the hyperparameter optimization on the given task and returns the arfftrace content. - + Parameters ---------- model : Any @@ -479,7 +479,7 @@ def _run_task_get_arffcontent( The format in which to download the dataset. n_jobs : int Number of jobs to run in parallel. If None, use 1 core by default. If -1, use all available cores. - + Returns ------- Tuple[List[List], Optional[OpenMLRunTrace], OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] @@ -671,7 +671,7 @@ def _run_task_get_arffcontent_parallel_helper( "OrderedDict[str, float]", ]: """Helper function that runs a single model on a single task fold sample. - + Parameters ---------- extension : Extension @@ -690,13 +690,13 @@ def _run_task_get_arffcontent_parallel_helper( The dataset format to be used. configuration : Dict Hyperparameters to configure the model. - + Returns ------- Tuple[np.ndarray, Optional[pd.DataFrame], np.ndarray, Optional[pd.DataFrame], Optional[OpenMLRunTrace], OrderedDict[str, float]] - A tuple containing the predictions, probability estimates (if applicable), - actual target values, actual target value probabilities (if applicable), + A tuple containing the predictions, probability estimates (if applicable), + actual target values, actual target value probabilities (if applicable), the trace object of the OpenML run (if applicable), and a dictionary of local measures for this particular fold. """ # Sets up the OpenML instantiated in the child process to match that of the parent's diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 28171900f..1f3808255 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -35,7 +35,7 @@ class OpenMLRunTrace(object): def __init__(self, run_id: int, trace_iterations: List[List]): """Object to hold the trace content of a run. - + Parameters ---------- run_id : int @@ -238,7 +238,7 @@ def trace_from_arff(cls, arff_obj): @classmethod def _trace_from_arff_struct(cls, attributes, content, error_message): """Generate a trace dictionary from ARFF structure. - + Parameters ---------- cls : type @@ -249,7 +249,7 @@ def _trace_from_arff_struct(cls, attributes, content, error_message): List of instances. error_message : str Error message to raise if `setup_string` is in `attributes`. - + Returns ------- OrderedDict @@ -373,19 +373,19 @@ def trace_from_xml(cls, xml): @classmethod def merge_traces(cls, traces: List["OpenMLRunTrace"]) -> "OpenMLRunTrace": """Merge multiple traces into a single trace. - + Parameters ---------- cls : type Type of the trace object to be created. traces : List[OpenMLRunTrace] List of traces to merge. - + Returns ------- OpenMLRunTrace A trace object representing the merged traces. - + Raises ------ ValueError diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 76cce7014..3580de81a 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -62,17 +62,17 @@ def setup_exists(flow) -> int: def _get_cached_setup(setup_id: int): """Load a run from the cache. - + Parameters ---------- setup_id : int ID of the setup to be loaded. - + Returns ------- OpenMLSetup The loaded setup object. - + Raises ------ OpenMLCacheException @@ -289,14 +289,14 @@ def initialize_model(setup_id: int) -> Any: def _to_dict(flow_id: int, openml_parameter_settings): """Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. - + Parameters ---------- flow_id : int ID of the flow. openml_parameter_settings : List[OpenMLParameter] A list of OpenML parameter settings. - + Returns ------- OrderedDict @@ -350,7 +350,7 @@ def _create_setup_from_xml(result_dict, output_format="object"): def _create_setup_parameter_from_xml(result_dict, output_format="object"): """ - Create an OpenMLParameter object or a dictionary from an API xml result. + Create an OpenMLParameter object or a dictionary from an API xml result. """ if output_format == "object": return OpenMLParameter( diff --git a/openml/study/functions.py b/openml/study/functions.py index cf9587a45..f3d19218e 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -108,14 +108,14 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy: def get_nested_ids_from_result_dict(key: str, subkey: str) -> Optional[List]: """Extracts a list of nested IDs from a result dictionary. - + Parameters ---------- key : str Nested OpenML IDs. subkey : str The subkey contains the nested OpenML IDs. - + Returns ------- Optional[List] @@ -606,14 +606,14 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]: """Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. - + Parameters ---------- api_call : str The API call for retrieving the list of OpenML studies. output_format : str in {"object", "dataframe"} Format of the output, either 'object' for a dictionary or 'dataframe' for a Pandas DataFrame. - + Returns ------- Union[Dict, pd.DataFrame] diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 2a327ddb0..d54bc4b42 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -231,19 +231,19 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs): def __list_tasks(api_call, output_format="dict"): """Returns a dictionary or a Pandas DataFrame with information about OpenML tasks. - + Parameters ---------- api_call : str The API call specifying which tasks to return. output_format : str in {"dict", "dataframe"} Output format for the returned object. - + Returns ------- Union[Dict, pd.DataFrame] A dictionary or a Pandas DataFrame with information about OpenML tasks. - + Raises ------ ValueError diff --git a/openml/tasks/split.py b/openml/tasks/split.py index 2df3aad72..8112ba41b 100644 --- a/openml/tasks/split.py +++ b/openml/tasks/split.py @@ -137,7 +137,7 @@ def _from_arff_file(cls, filename: str) -> "OpenMLSplit": def from_dataset(self, X, Y, folds, repeats): """Generates a new OpenML dataset object from input data and cross-validation settings. - + Parameters ---------- X : array-like or sparse matrix @@ -148,7 +148,7 @@ def from_dataset(self, X, Y, folds, repeats): Number of cross-validation folds to generate. repeats : int Number of times to repeat the cross-validation process. - + Raises ------ NotImplementedError @@ -158,7 +158,7 @@ def from_dataset(self, X, Y, folds, repeats): def get(self, repeat=0, fold=0, sample=0): """Returns the specified data split from the CrossValidationSplit object. - + Parameters ---------- repeat : int @@ -167,12 +167,12 @@ def get(self, repeat=0, fold=0, sample=0): Index of the fold to retrieve. sample : int Index of the sample to retrieve. - + Returns ------- numpy.ndarray The data split for the specified repeat, fold, and sample. - + Raises ------ ValueError From 125ea1d74c5c29465be008c16a241154a300af30 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Mon, 21 Aug 2023 10:42:36 +0530 Subject: [PATCH 30/48] Update openml/tasks/task.py --- openml/tasks/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 26b275572..b4449a030 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -551,8 +551,8 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]": class OpenMLLearningCurveTask(OpenMLClassificationTask): - """OpenML Learning Curve object. - + """OpenML Learning Curve object. + Parameters ---------- task_type_id : TaskType From 7fa83591855ce4bad1c6bafff91d265c8a46340b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 05:12:52 +0000 Subject: [PATCH 31/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/tasks/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/tasks/task.py b/openml/tasks/task.py index b4449a030..f205bd926 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -46,7 +46,7 @@ class OpenMLTask(OpenMLBase): Refers to the data. estimation_procedure_id: int Refers to the type of estimates used. - estimation_procedure_type: str, default=None + estimation_procedure_type: str, default=None Refers to the type of estimation procedure used for the OpenML task. estimation_parameters: [Dict[str, str]], default=None Estimation parameters used for the OpenML task. @@ -399,7 +399,7 @@ def __init__( class OpenMLRegressionTask(OpenMLSupervisedTask): """OpenML Regression object. - + Parameters ---------- task_type_id : TaskType From 08055e7372e9723e3789e5415d266b9513c6c3b6 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 11:58:11 +0530 Subject: [PATCH 32/48] Update openml/extensions/sklearn/extension.py --- openml/extensions/sklearn/extension.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index d27b29293..065792c5a 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -2101,7 +2101,8 @@ def instantiate_model_from_hpo_class( return base_estimator def _extract_trace_data(self, model, rep_no, fold_no): - """Extracts data from a machine learning model's cross-validation results and creates an ARFF (Attribute-Relation File Format) trace. + """Extracts data from a machine learning model's cross-validation results + and creates an ARFF (Attribute-Relation File Format) trace. Parameters ---------- From 0d272b6a88ab308d0a71608a2b7309ed4766828f Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 11:59:30 +0530 Subject: [PATCH 33/48] Update openml/flows/flow.py --- openml/flows/flow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 6b011679d..52b813d36 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -523,7 +523,8 @@ def get_subflow(self, structure): def _copy_server_fields(source_flow, target_flow): - """Recursively copies the fields added by the server from the `source_flow` to the `target_flow`. + """Recursively copies the fields added by the server + from the `source_flow` to the `target_flow`. Parameters ---------- From de5fa6c516da313cda57a0336d30c32f7ccde7e0 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 12:00:24 +0530 Subject: [PATCH 34/48] Update openml/flows/functions.py --- openml/flows/functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 0c3efd39c..f07f78e17 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -337,7 +337,8 @@ def get_flow_id( def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: - """Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. + """Retrieve information about flows from OpenML API + and parse it to a dictionary or a Pandas DataFrame. Parameters ---------- From 35353871a44fc9faef469f96d6bca78233081c48 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 06:31:05 +0000 Subject: [PATCH 35/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/extensions/sklearn/extension.py | 2 +- openml/flows/flow.py | 2 +- openml/flows/functions.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index 065792c5a..4c7a8912d 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -2101,7 +2101,7 @@ def instantiate_model_from_hpo_class( return base_estimator def _extract_trace_data(self, model, rep_no, fold_no): - """Extracts data from a machine learning model's cross-validation results + """Extracts data from a machine learning model's cross-validation results and creates an ARFF (Attribute-Relation File Format) trace. Parameters diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 52b813d36..4831eb6a7 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -523,7 +523,7 @@ def get_subflow(self, structure): def _copy_server_fields(source_flow, target_flow): - """Recursively copies the fields added by the server + """Recursively copies the fields added by the server from the `source_flow` to the `target_flow`. Parameters diff --git a/openml/flows/functions.py b/openml/flows/functions.py index f07f78e17..45eea42dc 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -337,7 +337,7 @@ def get_flow_id( def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: - """Retrieve information about flows from OpenML API + """Retrieve information about flows from OpenML API and parse it to a dictionary or a Pandas DataFrame. Parameters From 4cc0abf94be3626b759641228f8b7db3ffe1e473 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 12:01:30 +0530 Subject: [PATCH 36/48] Update openml/runs/functions.py --- openml/runs/functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 0acbb1fdd..4251c7a49 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -128,7 +128,8 @@ def run_model_on_task( flow = extension.model_to_flow(model) def get_task_and_type_conversion(task: Union[int, str, OpenMLTask]) -> OpenMLTask: - """Retrieve an OpenMLTask object from either an integer or string ID, or directly from an OpenMLTask object. + """Retrieve an OpenMLTask object from either an integer or string ID, + or directly from an OpenMLTask object. Parameters ---------- From 554215fb702d887aeec5bf59b8cde974273db380 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 12:09:39 +0530 Subject: [PATCH 37/48] Update openml/runs/functions.py --- openml/runs/functions.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 4251c7a49..6590444cb 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -464,7 +464,8 @@ def _run_task_get_arffcontent( "OrderedDict[str, OrderedDict]", "OrderedDict[str, OrderedDict]", ]: - """Runs the hyperparameter optimization on the given task and returns the arfftrace content. + """Runs the hyperparameter optimization on the given task + and returns the arfftrace content. Parameters ---------- @@ -479,12 +480,15 @@ def _run_task_get_arffcontent( dataset_format : str The format in which to download the dataset. n_jobs : int - Number of jobs to run in parallel. If None, use 1 core by default. If -1, use all available cores. + Number of jobs to run in parallel. + If None, use 1 core by default. If -1, use all available cores. Returns ------- - Tuple[List[List], Optional[OpenMLRunTrace], OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] - A tuple containing the arfftrace content, the OpenML run trace, the global and local evaluation measures. + Tuple[List[List], Optional[OpenMLRunTrace], + OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] + A tuple containing the arfftrace content, + the OpenML run trace, the global and local evaluation measures. """ arff_datacontent = [] # type: List[List] traces = [] # type: List[OpenMLRunTrace] @@ -698,7 +702,8 @@ def _run_task_get_arffcontent_parallel_helper( Optional[OpenMLRunTrace], OrderedDict[str, float]] A tuple containing the predictions, probability estimates (if applicable), actual target values, actual target value probabilities (if applicable), - the trace object of the OpenML run (if applicable), and a dictionary of local measures for this particular fold. + the trace object of the OpenML run (if applicable), + and a dictionary of local measures for this particular fold. """ # Sets up the OpenML instantiated in the child process to match that of the parent's # if configuration=None, loads the default From 4e1765eec22b923805eb2e0ccc753021a88901bd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 06:39:58 +0000 Subject: [PATCH 38/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/runs/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 6590444cb..5e31ed370 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -485,7 +485,7 @@ def _run_task_get_arffcontent( Returns ------- - Tuple[List[List], Optional[OpenMLRunTrace], + Tuple[List[List], Optional[OpenMLRunTrace], OrderedDict[str, OrderedDict], OrderedDict[str, OrderedDict]] A tuple containing the arfftrace content, the OpenML run trace, the global and local evaluation measures. From f97655536692798735a1c5ef1781f044a0bf5226 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 12:10:47 +0530 Subject: [PATCH 39/48] Update openml/setups/functions.py --- openml/setups/functions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 3580de81a..e26770a03 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -288,7 +288,8 @@ def initialize_model(setup_id: int) -> Any: def _to_dict(flow_id: int, openml_parameter_settings): - """Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. + """Convert a flow ID and a list of OpenML parameter settings to + a dictionary representation that can be serialized to XML. Parameters ---------- From 95bfc6940b7b82ad946b384f7448b60194aecf8d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 06:41:08 +0000 Subject: [PATCH 40/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/setups/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index e26770a03..410a1e964 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -288,7 +288,7 @@ def initialize_model(setup_id: int) -> Any: def _to_dict(flow_id: int, openml_parameter_settings): - """Convert a flow ID and a list of OpenML parameter settings to + """Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. Parameters From c0eedfaaa33758d502df99def9ed53ad36acc608 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 12:12:52 +0530 Subject: [PATCH 41/48] Update openml/study/functions.py --- openml/study/functions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/openml/study/functions.py b/openml/study/functions.py index f3d19218e..b10792289 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -605,19 +605,22 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]: - """Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. + """Retrieves the list of OpenML studies and + returns it in a dictionary or a Pandas DataFrame. Parameters ---------- api_call : str The API call for retrieving the list of OpenML studies. output_format : str in {"object", "dataframe"} - Format of the output, either 'object' for a dictionary or 'dataframe' for a Pandas DataFrame. + Format of the output, either 'object' for a dictionary + or 'dataframe' for a Pandas DataFrame. Returns ------- Union[Dict, pd.DataFrame] - A dictionary or Pandas DataFrame of OpenML studies, depending on the value of 'output_format'. + A dictionary or Pandas DataFrame of OpenML studies, + depending on the value of 'output_format'. """ xml_string = openml._api_calls._perform_api_call(api_call, "get") study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) From af2eac7ce7b1ee29faa048b7607e411ac53caf17 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 06:43:08 +0000 Subject: [PATCH 42/48] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- openml/study/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/study/functions.py b/openml/study/functions.py index b10792289..05d100ccd 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -605,7 +605,7 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]: - """Retrieves the list of OpenML studies and + """Retrieves the list of OpenML studies and returns it in a dictionary or a Pandas DataFrame. Parameters @@ -619,7 +619,7 @@ def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame Returns ------- Union[Dict, pd.DataFrame] - A dictionary or Pandas DataFrame of OpenML studies, + A dictionary or Pandas DataFrame of OpenML studies, depending on the value of 'output_format'. """ xml_string = openml._api_calls._perform_api_call(api_call, "get") From ae8581fda39ba55e66cadc44aca22fea4706bba4 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Tue, 19 Sep 2023 12:15:26 +0530 Subject: [PATCH 43/48] Update openml/tasks/functions.py --- openml/tasks/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index d54bc4b42..41d8d0197 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -247,8 +247,8 @@ def __list_tasks(api_call, output_format="dict"): Raises ------ ValueError - If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', or has an incorrect value for - '@xmlns:oml'. + If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', + or has an incorrect value for '@xmlns:oml'. KeyError If an invalid key is found in the XML for a task. """ From e3c475a1c10ce68fdb5ebe86de539ed3ec74a7f2 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Mon, 30 Oct 2023 14:45:50 -0700 Subject: [PATCH 44/48] Update trace.py to fix mypy error --- openml/runs/trace.py | 160 ++++++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 78 deletions(-) diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 1f3808255..1b2057c9f 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -4,7 +4,7 @@ from dataclasses import dataclass import json import os -from typing import List, Tuple, Optional # noqa F401 +from typing import List, Tuple, Optional, Dict, Union # noqa F401 import arff import xmltodict @@ -19,6 +19,82 @@ ] +@dataclass +class OpenMLTraceIteration: + """ + OpenML Trace Iteration: parsed output from Run Trace call + Exactly one of `setup_string` or `parameters` must be provided. + + Parameters + ---------- + repeat : int + repeat number (in case of no repeats: 0) + + fold : int + fold number (in case of no folds: 0) + + iteration : int + iteration number of optimization procedure + + setup_string : str, optional + json string representing the parameters + If not provided, ``parameters`` should be set. + + evaluation : double + The evaluation that was awarded to this trace iteration. + Measure is defined by the task + + selected : bool + Whether this was the best of all iterations, and hence + selected for making predictions. Per fold/repeat there + should be only one iteration selected + + parameters : OrderedDict, optional + Dictionary specifying parameter names and their values. + If not provided, ``setup_string`` should be set. + """ + + repeat: int + fold: int + iteration: int + + evaluation: float + selected: bool + + setup_string: Optional[str] = None + parameters: Optional[OrderedDict] = None + + def __post_init__(self): + # TODO: refactor into one argument of type + if self.setup_string and self.parameters: + raise ValueError( + "Can only be instantiated with either `setup_string` or `parameters` argument." + ) + elif not (self.setup_string or self.parameters): + raise ValueError( + "Either `setup_string` or `parameters` needs to be passed as argument." + ) + if self.parameters is not None and not isinstance(self.parameters, OrderedDict): + raise TypeError( + "argument parameters is not an instance of OrderedDict, but %s" + % str(type(self.parameters)) + ) + + def get_parameters(self): + result = {} + # parameters have prefix 'parameter_' + + if self.setup_string: + for param in self.setup_string: + key = param[len(PREFIX) :] + value = self.setup_string[param] + result[key] = json.loads(value) + else: + for param, value in self.parameters.items(): + result[param[len(PREFIX) :]] = value + return result + + class OpenMLRunTrace(object): """OpenML Run Trace: parsed output from Run Trace call @@ -33,7 +109,11 @@ class OpenMLRunTrace(object): """ - def __init__(self, run_id: int, trace_iterations: List[List]): + def __init__( + self, + run_id: Union[int, None], + trace_iterations: Dict[Tuple[int, int, int], OpenMLTraceIteration], + ): """Object to hold the trace content of a run. Parameters @@ -431,79 +511,3 @@ def __repr__(self): def __iter__(self): for val in self.trace_iterations.values(): yield val - - -@dataclass -class OpenMLTraceIteration: - """ - OpenML Trace Iteration: parsed output from Run Trace call - Exactly one of `setup_string` or `parameters` must be provided. - - Parameters - ---------- - repeat : int - repeat number (in case of no repeats: 0) - - fold : int - fold number (in case of no folds: 0) - - iteration : int - iteration number of optimization procedure - - setup_string : str, optional - json string representing the parameters - If not provided, ``parameters`` should be set. - - evaluation : double - The evaluation that was awarded to this trace iteration. - Measure is defined by the task - - selected : bool - Whether this was the best of all iterations, and hence - selected for making predictions. Per fold/repeat there - should be only one iteration selected - - parameters : OrderedDict, optional - Dictionary specifying parameter names and their values. - If not provided, ``setup_string`` should be set. - """ - - repeat: int - fold: int - iteration: int - - evaluation: float - selected: bool - - setup_string: Optional[str] = None - parameters: Optional[OrderedDict] = None - - def __post_init__(self): - # TODO: refactor into one argument of type - if self.setup_string and self.parameters: - raise ValueError( - "Can only be instantiated with either `setup_string` or `parameters` argument." - ) - elif not (self.setup_string or self.parameters): - raise ValueError( - "Either `setup_string` or `parameters` needs to be passed as argument." - ) - if self.parameters is not None and not isinstance(self.parameters, OrderedDict): - raise TypeError( - "argument parameters is not an instance of OrderedDict, but %s" - % str(type(self.parameters)) - ) - - def get_parameters(self): - result = {} - # parameters have prefix 'parameter_' - - if self.setup_string: - for param in self.setup_string: - key = param[len(PREFIX) :] - value = self.setup_string[param] - result[key] = json.loads(value) - else: - for param, value in self.parameters.items(): - result[param[len(PREFIX) :]] = value - return result From 51798e0de3b48b17b24367b9da16316efdd07429 Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Mon, 30 Oct 2023 14:46:23 -0700 Subject: [PATCH 45/48] Update functions.py to fix mypy error --- openml/flows/functions.py | 701 +++++++++++++------------------------- 1 file changed, 246 insertions(+), 455 deletions(-) diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 45eea42dc..bc6d21aaa 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -1,586 +1,377 @@ # License: BSD 3-Clause import warnings - -import dateutil.parser from collections import OrderedDict -import os import io -import re +import os +from typing import Any, Union, List, Dict, Optional + import xmltodict import pandas as pd -from typing import Any, Union, Dict, Optional, List -from ..exceptions import OpenMLCacheException -import openml._api_calls -from . import OpenMLFlow +import openml +from .. import config +from .setup import OpenMLSetup, OpenMLParameter +from openml.flows import flow_exists +import openml.exceptions import openml.utils -FLOWS_CACHE_DIR_NAME = "flows" - - -def _get_cached_flows() -> OrderedDict: - """Return all the cached flows. - - Returns - ------- - flows : OrderedDict - Dictionary with flows. Each flow is an instance of OpenMLFlow. +def setup_exists(flow) -> int: """ - flows = OrderedDict() # type: 'OrderedDict[int, OpenMLFlow]' - - flow_cache_dir = openml.utils._create_cache_directory(FLOWS_CACHE_DIR_NAME) - directory_content = os.listdir(flow_cache_dir) - directory_content.sort() - # Find all flow ids for which we have downloaded - # the flow description - - for filename in directory_content: - if not re.match(r"[0-9]*", filename): - continue - - fid = int(filename) - flows[fid] = _get_cached_flow(fid) - - return flows - - -def _get_cached_flow(fid: int) -> OpenMLFlow: - """Get the cached flow with the given id. + Checks whether a hyperparameter configuration already exists on the server. Parameters ---------- - fid : int - Flow id. + flow : flow + The openml flow object. Should have flow id present for the main flow + and all subflows (i.e., it should be downloaded from the server by + means of flow.get, and not instantiated locally) Returns ------- - OpenMLFlow. + setup_id : int + setup id iff exists, False otherwise """ + # sadly, this api call relies on a run object + openml.flows.functions._check_flow_for_server_id(flow) + if flow.model is None: + raise ValueError("Flow should have model field set with the actual model.") + if flow.extension is None: + raise ValueError("Flow should have model field set with the correct extension.") + + # checks whether the flow exists on the server and flow ids align + exists = flow_exists(flow.name, flow.external_version) + if exists != flow.flow_id: + raise ValueError( + f"Local flow id ({flow.id}) differs from server id ({exists}). " + "If this issue persists, please contact the developers." + ) - fid_cache_dir = openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, fid) - flow_file = os.path.join(fid_cache_dir, "flow.xml") - - try: - with io.open(flow_file, encoding="utf8") as fh: - return _create_flow_from_xml(fh.read()) - except (OSError, IOError): - openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir) - raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) + openml_param_settings = flow.extension.obtain_parameter_values(flow) + description = xmltodict.unparse(_to_dict(flow.flow_id, openml_param_settings), pretty=True) + file_elements = { + "description": ("description.arff", description) + } # type: openml._api_calls.FILE_ELEMENTS_TYPE + result = openml._api_calls._perform_api_call( + "/setup/exists/", "post", file_elements=file_elements + ) + result_dict = xmltodict.parse(result) + setup_id = int(result_dict["oml:setup_exists"]["oml:id"]) + return setup_id if setup_id > 0 else False -@openml.utils.thread_safe_if_oslo_installed -def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow: - """Download the OpenML flow for a given flow ID. +def _get_cached_setup(setup_id: int): + """Load a run from the cache. Parameters ---------- - flow_id : int - The OpenML flow id. - - reinstantiate: bool - Whether to reinstantiate the flow to a model instance. - - strict_version : bool, default=True - Whether to fail if version requirements are not fulfilled. + setup_id : int + ID of the setup to be loaded. Returns ------- - flow : OpenMLFlow - the flow - """ - flow_id = int(flow_id) - flow = _get_flow_description(flow_id) + OpenMLSetup + The loaded setup object. - if reinstantiate: - flow.model = flow.extension.flow_to_model(flow, strict_version=strict_version) - if not strict_version: - # check if we need to return a new flow b/c of version mismatch - new_flow = flow.extension.model_to_flow(flow.model) - if new_flow.dependencies != flow.dependencies: - return new_flow - return flow + Raises + ------ + OpenMLCacheException + If the setup file for the given setup ID is not cached. + """ + cache_dir = config.get_cache_directory() + setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id)) + try: + setup_file = os.path.join(setup_cache_dir, "description.xml") + with io.open(setup_file, encoding="utf8") as fh: + setup_xml = xmltodict.parse(fh.read()) + setup = _create_setup_from_xml(setup_xml, output_format="object") + return setup + except (OSError, IOError): + raise openml.exceptions.OpenMLCacheException( + "Setup file for setup id %d not cached" % setup_id + ) -def _get_flow_description(flow_id: int) -> OpenMLFlow: - """Get the Flow for a given ID. - Does the real work for get_flow. It returns a cached flow - instance if the flow exists locally, otherwise it downloads the - flow and returns an instance created from the xml representation. +def get_setup(setup_id): + """ + Downloads the setup (configuration) description from OpenML + and returns a structured object Parameters ---------- - flow_id : int - The OpenML flow id. + setup_id : int + The Openml setup_id Returns ------- - OpenMLFlow + dict or OpenMLSetup(an initialized openml setup object) """ - try: - return _get_cached_flow(flow_id) - except OpenMLCacheException: - xml_file = os.path.join( - openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id), - "flow.xml", - ) + setup_dir = os.path.join(config.get_cache_directory(), "setups", str(setup_id)) + setup_file = os.path.join(setup_dir, "description.xml") + + if not os.path.exists(setup_dir): + os.makedirs(setup_dir) - flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method="get") - with io.open(xml_file, "w", encoding="utf8") as fh: - fh.write(flow_xml) + try: + return _get_cached_setup(setup_id) + except openml.exceptions.OpenMLCacheException: + url_suffix = "/setup/%d" % setup_id + setup_xml = openml._api_calls._perform_api_call(url_suffix, "get") + with io.open(setup_file, "w", encoding="utf8") as fh: + fh.write(setup_xml) - return _create_flow_from_xml(flow_xml) + result_dict = xmltodict.parse(setup_xml) + return _create_setup_from_xml(result_dict, output_format="object") -def list_flows( +def list_setups( offset: Optional[int] = None, size: Optional[int] = None, + flow: Optional[int] = None, tag: Optional[str] = None, - output_format: str = "dict", - **kwargs + setup: Optional[List] = None, + output_format: str = "object", ) -> Union[Dict, pd.DataFrame]: """ - Return a list of all flows which are on OpenML. - (Supports large amount of results) + List all setups matching all of the given filters. Parameters ---------- offset : int, optional - the number of flows to skip, starting from the first size : int, optional - the maximum number of flows to return + flow : int, optional tag : str, optional - the tag to include - output_format: str, optional (default='dict') + setup : list(int), optional + output_format: str, optional (default='object') The parameter decides the format of the output. + - If 'object' the output is a dict of OpenMLSetup objects - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame - kwargs: dict, optional - Legal filter operators: uploader. Returns ------- - flows : dict of dicts, or dataframe - - If output_format='dict' - A mapping from flow_id to a dict giving a brief overview of the - respective flow. - Every flow is represented by a dictionary containing - the following information: - - flow id - - full name - - name - - version - - external version - - uploader - - - If output_format='dataframe' - Each row maps to a dataset - Each column contains the following information: - - flow id - - full name - - name - - version - - external version - - uploader + dict or dataframe """ - if output_format not in ["dataframe", "dict"]: + if output_format not in ["dataframe", "dict", "object"]: raise ValueError( - "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." + "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable." ) # TODO: [0.15] if output_format == "dict": msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15 " - "and pandas dataframes will be returned instead. To ensure your code " - "will continue to work, use `output_format`='dataframe'." + "Support for `output_format` of 'dict' will be removed in 0.15. " + "To ensure your code will continue to work, " + "use `output_format`='dataframe' or `output_format`='object'." ) warnings.warn(msg, category=FutureWarning, stacklevel=2) + batch_size = 1000 # batch size for setups is lower return openml.utils._list_all( output_format=output_format, - listing_call=_list_flows, + listing_call=_list_setups, offset=offset, size=size, + flow=flow, tag=tag, - **kwargs + setup=setup, + batch_size=batch_size, ) -def _list_flows(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: +def _list_setups(setup=None, output_format="object", **kwargs): """ - Perform the api call that return a list of all flows. + Perform API call `/setup/list/{filters}` Parameters ---------- + The setup argument that is a list is separated from the single value + filters which are put into the kwargs. + + setup : list(int), optional + output_format: str, optional (default='dict') The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional - Legal filter operators: uploader, tag, limit, offset. + Legal filter operators: flow, setup, limit, offset, tag. Returns ------- - flows : dict, or dataframe + dict or dataframe """ - api_call = "flow/list" + api_call = "setup/list" + if setup is not None: + api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) if kwargs is not None: for operator, value in kwargs.items(): api_call += "/%s/%s" % (operator, value) - return __list_flows(api_call=api_call, output_format=output_format) + return __list_setups(api_call=api_call, output_format=output_format) -def flow_exists(name: str, external_version: str) -> Union[int, bool]: - """Retrieves the flow id. - - A flow is uniquely identified by name + external_version. - - Parameters - ---------- - name : string - Name of the flow - external_version : string - Version information associated with flow. +def __list_setups(api_call, output_format="object"): + """Helper function to parse API calls which are lists of setups""" + xml_string = openml._api_calls._perform_api_call(api_call, "get") + setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",)) + openml_uri = "http://openml.org/openml" + # Minimalistic check if the XML is useful + if "oml:setups" not in setups_dict: + raise ValueError( + 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict) + ) + elif "@xmlns:oml" not in setups_dict["oml:setups"]: + raise ValueError( + "Error in return XML, does not contain " + '"oml:setups"/@xmlns:oml: %s' % str(setups_dict) + ) + elif setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: + raise ValueError( + "Error in return XML, value of " + '"oml:seyups"/@xmlns:oml is not ' + '"%s": %s' % (openml_uri, str(setups_dict)) + ) - Returns - ------- - flow_exist : int or bool - flow id iff exists, False otherwise + assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"]) - Notes - ----- - see https://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version - """ - if not (isinstance(name, str) and len(name) > 0): - raise ValueError("Argument 'name' should be a non-empty string") - if not (isinstance(name, str) and len(external_version) > 0): - raise ValueError("Argument 'version' should be a non-empty string") - - xml_response = openml._api_calls._perform_api_call( - "flow/exists", - "post", - data={"name": name, "external_version": external_version}, - ) - - result_dict = xmltodict.parse(xml_response) - flow_id = int(result_dict["oml:flow_exists"]["oml:id"]) - return flow_id if flow_id > 0 else False + setups = dict() + for setup_ in setups_dict["oml:setups"]["oml:setup"]: + # making it a dict to give it the right format + current = _create_setup_from_xml( + {"oml:setup_parameters": setup_}, output_format=output_format + ) + if output_format == "object": + setups[current.setup_id] = current + else: + setups[current["setup_id"]] = current + if output_format == "dataframe": + setups = pd.DataFrame.from_dict(setups, orient="index") -def get_flow_id( - model: Optional[Any] = None, - name: Optional[str] = None, - exact_version=True, -) -> Union[int, bool, List[int]]: - """Retrieves the flow id for a model or a flow name. + return setups - Provide either a model or a name to this function. Depending on the input, it does - * ``model`` and ``exact_version == True``: This helper function first queries for the necessary - extension. Second, it uses that extension to convert the model into a flow. Third, it - executes ``flow_exists`` to potentially obtain the flow id the flow is published to the - server. - * ``model`` and ``exact_version == False``: This helper function first queries for the - necessary extension. Second, it uses that extension to convert the model into a flow. Third - it calls ``list_flows`` and filters the returned values based on the flow name. - * ``name``: Ignores ``exact_version`` and calls ``list_flows``, then filters the returned - values based on the flow name. +def initialize_model(setup_id: int) -> Any: + """ + Initialized a model based on a setup_id (i.e., using the exact + same parameter settings) Parameters ---------- - model : object - Any model. Must provide either ``model`` or ``name``. - name : str - Name of the flow. Must provide either ``model`` or ``name``. - exact_version : bool - Whether to return the flow id of the exact version or all flow ids where the name - of the flow matches. This is only taken into account for a model where a version number - is available. + setup_id : int + The Openml setup_id Returns ------- - int or bool, List - flow id iff exists, ``False`` otherwise, List if ``exact_version is False`` + model """ - if model is None and name is None: - raise ValueError( - "Need to provide either argument `model` or argument `name`, but both are `None`." - ) - elif model is not None and name is not None: - raise ValueError("Must provide either argument `model` or argument `name`, but not both.") - - if model is not None: - extension = openml.extensions.get_extension_by_model(model, raise_if_no_extension=True) - if extension is None: - # This should never happen and is only here to please mypy will be gone soon once the - # whole function is removed - raise TypeError(extension) - flow = extension.model_to_flow(model) - flow_name = flow.name - external_version = flow.external_version - else: - flow_name = name - exact_version = False + setup = get_setup(setup_id) + flow = openml.flows.get_flow(setup.flow_id) + + # instead of using scikit-learns or any other library's "set_params" function, we override the + # OpenMLFlow objects default parameter value so we can utilize the + # Extension.flow_to_model() function to reinitialize the flow with the set defaults. + for hyperparameter in setup.parameters.values(): + structure = flow.get_structure("flow_id") + if len(structure[hyperparameter.flow_id]) > 0: + subflow = flow.get_subflow(structure[hyperparameter.flow_id]) + else: + subflow = flow + subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value - if exact_version: - return flow_exists(name=flow_name, external_version=external_version) - else: - flows = list_flows(output_format="dataframe") - assert isinstance(flows, pd.DataFrame) # Make mypy happy - flows = flows.query('name == "{}"'.format(flow_name)) - return flows["id"].to_list() + model = flow.extension.flow_to_model(flow) + return model -def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: - """Retrieve information about flows from OpenML API - and parse it to a dictionary or a Pandas DataFrame. +def _to_dict(flow_id: int, openml_parameter_settings) -> OrderedDict: + """Convert a flow ID and a list of OpenML parameter settings to + a dictionary representation that can be serialized to XML. Parameters ---------- - api_call: str - Retrieves the information about flows. - output_format: str in {"dict", "dataframe"} - The output format. - Returns + flow_id : int + ID of the flow. + openml_parameter_settings : List[OpenMLParameter] + A list of OpenML parameter settings. + Returns ------- - The flows information in the specified output format. + OrderedDict + A dictionary representation of the flow ID and parameter settings. """ - xml_string = openml._api_calls._perform_api_call(api_call, "get") - flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",)) - - # Minimalistic check if the XML is useful - assert isinstance(flows_dict["oml:flows"]["oml:flow"], list), type(flows_dict["oml:flows"]) - assert flows_dict["oml:flows"]["@xmlns:oml"] == "http://openml.org/openml", flows_dict[ - "oml:flows" - ]["@xmlns:oml"] - - flows = dict() - for flow_ in flows_dict["oml:flows"]["oml:flow"]: - fid = int(flow_["oml:id"]) - flow = { - "id": fid, - "full_name": flow_["oml:full_name"], - "name": flow_["oml:name"], - "version": flow_["oml:version"], - "external_version": flow_["oml:external_version"], - "uploader": flow_["oml:uploader"], - } - flows[fid] = flow - - if output_format == "dataframe": - flows = pd.DataFrame.from_dict(flows, orient="index") - - return flows - - -def _check_flow_for_server_id(flow: OpenMLFlow) -> None: - """Raises a ValueError if the flow or any of its subflows has no flow id.""" - - # Depth-first search to check if all components were uploaded to the - # server before parsing the parameters - stack = list() - stack.append(flow) - while len(stack) > 0: - current = stack.pop() - if current.flow_id is None: - raise ValueError("Flow %s has no flow_id!" % current.name) - else: - for component in current.components.values(): - stack.append(component) - - -def assert_flows_equal( - flow1: OpenMLFlow, - flow2: OpenMLFlow, - ignore_parameter_values_on_older_children: Optional[str] = None, - ignore_parameter_values: bool = False, - ignore_custom_name_if_none: bool = False, - check_description: bool = True, -) -> None: - """Check equality of two flows. - - Two flows are equal if their all keys which are not set by the server - are equal, as well as all their parameters and components. + # for convenience, this function (ab)uses the run object. + xml: OrderedDict = OrderedDict() + xml["oml:run"] = OrderedDict() + xml["oml:run"]["@xmlns:oml"] = "http://openml.org/openml" + xml["oml:run"]["oml:flow_id"] = flow_id + xml["oml:run"]["oml:parameter_setting"] = openml_parameter_settings - Parameters - ---------- - flow1 : OpenMLFlow - - flow2 : OpenMLFlow - - ignore_parameter_values_on_older_children : str (optional) - If set to ``OpenMLFlow.upload_date``, ignores parameters in a child - flow if it's upload date predates the upload date of the parent flow. + return xml - ignore_parameter_values : bool - Whether to ignore parameter values when comparing flows. - ignore_custom_name_if_none : bool - Whether to ignore the custom name field if either flow has `custom_name` equal to `None`. - - check_description : bool - Whether to ignore matching of flow descriptions. +def _create_setup_from_xml(result_dict, output_format="object"): + """ + Turns an API xml result into a OpenMLSetup object (or dict) """ - if not isinstance(flow1, OpenMLFlow): - raise TypeError("Argument 1 must be of type OpenMLFlow, but is %s" % type(flow1)) - - if not isinstance(flow2, OpenMLFlow): - raise TypeError("Argument 2 must be of type OpenMLFlow, but is %s" % type(flow2)) - - # TODO as they are actually now saved during publish, it might be good to - # check for the equality of these as well. - generated_by_the_server = [ - "flow_id", - "uploader", - "version", - "upload_date", - # Tags aren't directly created by the server, - # but the uploader has no control over them! - "tags", - ] - ignored_by_python_api = ["binary_url", "binary_format", "binary_md5", "model", "_entity_id"] - - for key in set(flow1.__dict__.keys()).union(flow2.__dict__.keys()): - if key in generated_by_the_server + ignored_by_python_api: - continue - attr1 = getattr(flow1, key, None) - attr2 = getattr(flow2, key, None) - if key == "components": - if not (isinstance(attr1, Dict) and isinstance(attr2, Dict)): - raise TypeError("Cannot compare components because they are not dictionary.") - - for name in set(attr1.keys()).union(attr2.keys()): - if name not in attr1: - raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name - ) - if name not in attr2: - raise ValueError( - "Component %s only available in " "argument2, but not in argument1." % name - ) - assert_flows_equal( - attr1[name], - attr2[name], - ignore_parameter_values_on_older_children, - ignore_parameter_values, - ignore_custom_name_if_none, + setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"]) + flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"]) + parameters = {} + if "oml:parameter" not in result_dict["oml:setup_parameters"]: + parameters = None + else: + # basically all others + xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] + if isinstance(xml_parameters, dict): + id = int(xml_parameters["oml:id"]) + parameters[id] = _create_setup_parameter_from_xml( + result_dict=xml_parameters, output_format=output_format + ) + elif isinstance(xml_parameters, list): + for xml_parameter in xml_parameters: + id = int(xml_parameter["oml:id"]) + parameters[id] = _create_setup_parameter_from_xml( + result_dict=xml_parameter, output_format=output_format ) - elif key == "_extension": - continue - elif check_description and key == "description": - # to ignore matching of descriptions since sklearn based flows may have - # altering docstrings and is not guaranteed to be consistent - continue else: - if key == "parameters": - if ignore_parameter_values or ignore_parameter_values_on_older_children: - params_flow_1 = set(flow1.parameters.keys()) - params_flow_2 = set(flow2.parameters.keys()) - symmetric_difference = params_flow_1 ^ params_flow_2 - if len(symmetric_difference) > 0: - raise ValueError( - "Flow %s: parameter set of flow " - "differs from the parameters stored " - "on the server." % flow1.name - ) - - if ignore_parameter_values_on_older_children: - upload_date_current_flow = dateutil.parser.parse(flow1.upload_date) - upload_date_parent_flow = dateutil.parser.parse( - ignore_parameter_values_on_older_children - ) - if upload_date_current_flow < upload_date_parent_flow: - continue - - if ignore_parameter_values: - # Continue needs to be done here as the first if - # statement triggers in both special cases - continue - elif ( - key == "custom_name" - and ignore_custom_name_if_none - and (attr1 is None or attr2 is None) - ): - # If specified, we allow `custom_name` inequality if one flow's name is None. - # Helps with backwards compatibility as `custom_name` is now auto-generated, but - # before it used to be `None`. - continue - elif key == "parameters_meta_info": - # this value is a dictionary where each key is a parameter name, containing another - # dictionary with keys specifying the parameter's 'description' and 'data_type' - # checking parameter descriptions can be ignored since that might change - # data type check can also be ignored if one of them is not defined, i.e., None - params1 = set(flow1.parameters_meta_info) - params2 = set(flow2.parameters_meta_info) - if params1 != params2: - raise ValueError( - "Parameter list in meta info for parameters differ " "in the two flows." - ) - # iterating over the parameter's meta info list - for param in params1: - if ( - isinstance(flow1.parameters_meta_info[param], Dict) - and isinstance(flow2.parameters_meta_info[param], Dict) - and "data_type" in flow1.parameters_meta_info[param] - and "data_type" in flow2.parameters_meta_info[param] - ): - value1 = flow1.parameters_meta_info[param]["data_type"] - value2 = flow2.parameters_meta_info[param]["data_type"] - else: - value1 = flow1.parameters_meta_info[param] - value2 = flow2.parameters_meta_info[param] - if value1 is None or value2 is None: - continue - elif value1 != value2: - raise ValueError( - "Flow {}: data type for parameter {} in {} differ " - "as {}\nvs\n{}".format(flow1.name, param, key, value1, value2) - ) - # the continue is to avoid the 'attr != attr2' check at end of function - continue - - if attr1 != attr2: - raise ValueError( - "Flow %s: values for attribute '%s' differ: " - "'%s'\nvs\n'%s'." % (str(flow1.name), str(key), str(attr1), str(attr2)) - ) - + raise ValueError( + "Expected None, list or dict, received " + "something else: %s" % str(type(xml_parameters)) + ) -def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow: - """Create flow object from xml + if output_format in ["dataframe", "dict"]: + return_dict = {"setup_id": setup_id, "flow_id": flow_id} + return_dict["parameters"] = parameters + return return_dict + return OpenMLSetup(setup_id, flow_id, parameters) - Parameters - ---------- - flow_xml: xml representation of a flow - Returns - ------- - OpenMLFlow +def _create_setup_parameter_from_xml(result_dict, output_format="object"): """ - - return OpenMLFlow._from_dict(xmltodict.parse(flow_xml)) - - -def delete_flow(flow_id: int) -> bool: - """Delete flow with id `flow_id` from the OpenML server. - - You can only delete flows which you uploaded and which - which are not linked to runs. - - Parameters - ---------- - flow_id : int - OpenML id of the flow - - Returns - ------- - bool - True if the deletion was successful. False otherwise. + Create an OpenMLParameter object or a dictionary from an API xml result. """ - return openml.utils._delete_entity("flow", flow_id) + if output_format == "object": + return OpenMLParameter( + input_id=int(result_dict["oml:id"]), + flow_id=int(result_dict["oml:flow_id"]), + flow_name=result_dict["oml:flow_name"], + full_name=result_dict["oml:full_name"], + parameter_name=result_dict["oml:parameter_name"], + data_type=result_dict["oml:data_type"], + default_value=result_dict["oml:default_value"], + value=result_dict["oml:value"], + ) + else: + return { + "input_id": int(result_dict["oml:id"]), + "flow_id": int(result_dict["oml:flow_id"]), + "flow_name": result_dict["oml:flow_name"], + "full_name": result_dict["oml:full_name"], + "parameter_name": result_dict["oml:parameter_name"], + "data_type": result_dict["oml:data_type"], + "default_value": result_dict["oml:default_value"], + "value": result_dict["oml:value"], + } From 3ab73e0b4da5d7cf57b73f242e27239c65dd67dc Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Mon, 30 Oct 2023 14:49:00 -0700 Subject: [PATCH 46/48] fix copy paste error --- openml/flows/functions.py | 701 +++++++++++++++++++++++++------------- 1 file changed, 455 insertions(+), 246 deletions(-) diff --git a/openml/flows/functions.py b/openml/flows/functions.py index bc6d21aaa..45eea42dc 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -1,377 +1,586 @@ # License: BSD 3-Clause import warnings + +import dateutil.parser from collections import OrderedDict -import io import os -from typing import Any, Union, List, Dict, Optional - +import io +import re import xmltodict import pandas as pd +from typing import Any, Union, Dict, Optional, List -import openml -from .. import config -from .setup import OpenMLSetup, OpenMLParameter -from openml.flows import flow_exists -import openml.exceptions +from ..exceptions import OpenMLCacheException +import openml._api_calls +from . import OpenMLFlow import openml.utils -def setup_exists(flow) -> int: - """ - Checks whether a hyperparameter configuration already exists on the server. +FLOWS_CACHE_DIR_NAME = "flows" - Parameters - ---------- - flow : flow - The openml flow object. Should have flow id present for the main flow - and all subflows (i.e., it should be downloaded from the server by - means of flow.get, and not instantiated locally) + +def _get_cached_flows() -> OrderedDict: + """Return all the cached flows. Returns ------- - setup_id : int - setup id iff exists, False otherwise + flows : OrderedDict + Dictionary with flows. Each flow is an instance of OpenMLFlow. """ - # sadly, this api call relies on a run object - openml.flows.functions._check_flow_for_server_id(flow) - if flow.model is None: - raise ValueError("Flow should have model field set with the actual model.") - if flow.extension is None: - raise ValueError("Flow should have model field set with the correct extension.") - - # checks whether the flow exists on the server and flow ids align - exists = flow_exists(flow.name, flow.external_version) - if exists != flow.flow_id: - raise ValueError( - f"Local flow id ({flow.id}) differs from server id ({exists}). " - "If this issue persists, please contact the developers." - ) + flows = OrderedDict() # type: 'OrderedDict[int, OpenMLFlow]' - openml_param_settings = flow.extension.obtain_parameter_values(flow) - description = xmltodict.unparse(_to_dict(flow.flow_id, openml_param_settings), pretty=True) - file_elements = { - "description": ("description.arff", description) - } # type: openml._api_calls.FILE_ELEMENTS_TYPE - result = openml._api_calls._perform_api_call( - "/setup/exists/", "post", file_elements=file_elements - ) - result_dict = xmltodict.parse(result) - setup_id = int(result_dict["oml:setup_exists"]["oml:id"]) - return setup_id if setup_id > 0 else False + flow_cache_dir = openml.utils._create_cache_directory(FLOWS_CACHE_DIR_NAME) + directory_content = os.listdir(flow_cache_dir) + directory_content.sort() + # Find all flow ids for which we have downloaded + # the flow description + + for filename in directory_content: + if not re.match(r"[0-9]*", filename): + continue + + fid = int(filename) + flows[fid] = _get_cached_flow(fid) + + return flows -def _get_cached_setup(setup_id: int): - """Load a run from the cache. +def _get_cached_flow(fid: int) -> OpenMLFlow: + """Get the cached flow with the given id. Parameters ---------- - setup_id : int - ID of the setup to be loaded. + fid : int + Flow id. Returns ------- - OpenMLSetup - The loaded setup object. - - Raises - ------ - OpenMLCacheException - If the setup file for the given setup ID is not cached. + OpenMLFlow. """ - cache_dir = config.get_cache_directory() - setup_cache_dir = os.path.join(cache_dir, "setups", str(setup_id)) - try: - setup_file = os.path.join(setup_cache_dir, "description.xml") - with io.open(setup_file, encoding="utf8") as fh: - setup_xml = xmltodict.parse(fh.read()) - setup = _create_setup_from_xml(setup_xml, output_format="object") - return setup + fid_cache_dir = openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, fid) + flow_file = os.path.join(fid_cache_dir, "flow.xml") + + try: + with io.open(flow_file, encoding="utf8") as fh: + return _create_flow_from_xml(fh.read()) except (OSError, IOError): - raise openml.exceptions.OpenMLCacheException( - "Setup file for setup id %d not cached" % setup_id - ) + openml.utils._remove_cache_dir_for_id(FLOWS_CACHE_DIR_NAME, fid_cache_dir) + raise OpenMLCacheException("Flow file for fid %d not " "cached" % fid) -def get_setup(setup_id): - """ - Downloads the setup (configuration) description from OpenML - and returns a structured object +@openml.utils.thread_safe_if_oslo_installed +def get_flow(flow_id: int, reinstantiate: bool = False, strict_version: bool = True) -> OpenMLFlow: + """Download the OpenML flow for a given flow ID. Parameters ---------- - setup_id : int - The Openml setup_id + flow_id : int + The OpenML flow id. + + reinstantiate: bool + Whether to reinstantiate the flow to a model instance. + + strict_version : bool, default=True + Whether to fail if version requirements are not fulfilled. Returns ------- - dict or OpenMLSetup(an initialized openml setup object) + flow : OpenMLFlow + the flow """ - setup_dir = os.path.join(config.get_cache_directory(), "setups", str(setup_id)) - setup_file = os.path.join(setup_dir, "description.xml") + flow_id = int(flow_id) + flow = _get_flow_description(flow_id) + + if reinstantiate: + flow.model = flow.extension.flow_to_model(flow, strict_version=strict_version) + if not strict_version: + # check if we need to return a new flow b/c of version mismatch + new_flow = flow.extension.model_to_flow(flow.model) + if new_flow.dependencies != flow.dependencies: + return new_flow + return flow + - if not os.path.exists(setup_dir): - os.makedirs(setup_dir) +def _get_flow_description(flow_id: int) -> OpenMLFlow: + """Get the Flow for a given ID. + Does the real work for get_flow. It returns a cached flow + instance if the flow exists locally, otherwise it downloads the + flow and returns an instance created from the xml representation. + + Parameters + ---------- + flow_id : int + The OpenML flow id. + + Returns + ------- + OpenMLFlow + """ try: - return _get_cached_setup(setup_id) - except openml.exceptions.OpenMLCacheException: - url_suffix = "/setup/%d" % setup_id - setup_xml = openml._api_calls._perform_api_call(url_suffix, "get") - with io.open(setup_file, "w", encoding="utf8") as fh: - fh.write(setup_xml) + return _get_cached_flow(flow_id) + except OpenMLCacheException: + xml_file = os.path.join( + openml.utils._create_cache_directory_for_id(FLOWS_CACHE_DIR_NAME, flow_id), + "flow.xml", + ) + + flow_xml = openml._api_calls._perform_api_call("flow/%d" % flow_id, request_method="get") + with io.open(xml_file, "w", encoding="utf8") as fh: + fh.write(flow_xml) - result_dict = xmltodict.parse(setup_xml) - return _create_setup_from_xml(result_dict, output_format="object") + return _create_flow_from_xml(flow_xml) -def list_setups( +def list_flows( offset: Optional[int] = None, size: Optional[int] = None, - flow: Optional[int] = None, tag: Optional[str] = None, - setup: Optional[List] = None, - output_format: str = "object", + output_format: str = "dict", + **kwargs ) -> Union[Dict, pd.DataFrame]: """ - List all setups matching all of the given filters. + Return a list of all flows which are on OpenML. + (Supports large amount of results) Parameters ---------- offset : int, optional + the number of flows to skip, starting from the first size : int, optional - flow : int, optional + the maximum number of flows to return tag : str, optional - setup : list(int), optional - output_format: str, optional (default='object') + the tag to include + output_format: str, optional (default='dict') The parameter decides the format of the output. - - If 'object' the output is a dict of OpenMLSetup objects - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame + kwargs: dict, optional + Legal filter operators: uploader. Returns ------- - dict or dataframe + flows : dict of dicts, or dataframe + - If output_format='dict' + A mapping from flow_id to a dict giving a brief overview of the + respective flow. + Every flow is represented by a dictionary containing + the following information: + - flow id + - full name + - name + - version + - external version + - uploader + + - If output_format='dataframe' + Each row maps to a dataset + Each column contains the following information: + - flow id + - full name + - name + - version + - external version + - uploader """ - if output_format not in ["dataframe", "dict", "object"]: + if output_format not in ["dataframe", "dict"]: raise ValueError( - "Invalid output format selected. " "Only 'dict', 'object', or 'dataframe' applicable." + "Invalid output format selected. " "Only 'dict' or 'dataframe' applicable." ) # TODO: [0.15] if output_format == "dict": msg = ( - "Support for `output_format` of 'dict' will be removed in 0.15. " - "To ensure your code will continue to work, " - "use `output_format`='dataframe' or `output_format`='object'." + "Support for `output_format` of 'dict' will be removed in 0.15 " + "and pandas dataframes will be returned instead. To ensure your code " + "will continue to work, use `output_format`='dataframe'." ) warnings.warn(msg, category=FutureWarning, stacklevel=2) - batch_size = 1000 # batch size for setups is lower return openml.utils._list_all( output_format=output_format, - listing_call=_list_setups, + listing_call=_list_flows, offset=offset, size=size, - flow=flow, tag=tag, - setup=setup, - batch_size=batch_size, + **kwargs ) -def _list_setups(setup=None, output_format="object", **kwargs): +def _list_flows(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]: """ - Perform API call `/setup/list/{filters}` + Perform the api call that return a list of all flows. Parameters ---------- - The setup argument that is a list is separated from the single value - filters which are put into the kwargs. - - setup : list(int), optional - output_format: str, optional (default='dict') The parameter decides the format of the output. - If 'dict' the output is a dict of dict - If 'dataframe' the output is a pandas DataFrame kwargs: dict, optional - Legal filter operators: flow, setup, limit, offset, tag. + Legal filter operators: uploader, tag, limit, offset. Returns ------- - dict or dataframe + flows : dict, or dataframe """ + api_call = "flow/list" - api_call = "setup/list" - if setup is not None: - api_call += "/setup/%s" % ",".join([str(int(i)) for i in setup]) if kwargs is not None: for operator, value in kwargs.items(): api_call += "/%s/%s" % (operator, value) - return __list_setups(api_call=api_call, output_format=output_format) + return __list_flows(api_call=api_call, output_format=output_format) -def __list_setups(api_call, output_format="object"): - """Helper function to parse API calls which are lists of setups""" - xml_string = openml._api_calls._perform_api_call(api_call, "get") - setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",)) - openml_uri = "http://openml.org/openml" - # Minimalistic check if the XML is useful - if "oml:setups" not in setups_dict: - raise ValueError( - 'Error in return XML, does not contain "oml:setups":' " %s" % str(setups_dict) - ) - elif "@xmlns:oml" not in setups_dict["oml:setups"]: - raise ValueError( - "Error in return XML, does not contain " - '"oml:setups"/@xmlns:oml: %s' % str(setups_dict) - ) - elif setups_dict["oml:setups"]["@xmlns:oml"] != openml_uri: - raise ValueError( - "Error in return XML, value of " - '"oml:seyups"/@xmlns:oml is not ' - '"%s": %s' % (openml_uri, str(setups_dict)) - ) +def flow_exists(name: str, external_version: str) -> Union[int, bool]: + """Retrieves the flow id. - assert isinstance(setups_dict["oml:setups"]["oml:setup"], list), type(setups_dict["oml:setups"]) + A flow is uniquely identified by name + external_version. - setups = dict() - for setup_ in setups_dict["oml:setups"]["oml:setup"]: - # making it a dict to give it the right format - current = _create_setup_from_xml( - {"oml:setup_parameters": setup_}, output_format=output_format - ) - if output_format == "object": - setups[current.setup_id] = current - else: - setups[current["setup_id"]] = current + Parameters + ---------- + name : string + Name of the flow + external_version : string + Version information associated with flow. - if output_format == "dataframe": - setups = pd.DataFrame.from_dict(setups, orient="index") + Returns + ------- + flow_exist : int or bool + flow id iff exists, False otherwise - return setups + Notes + ----- + see https://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version + """ + if not (isinstance(name, str) and len(name) > 0): + raise ValueError("Argument 'name' should be a non-empty string") + if not (isinstance(name, str) and len(external_version) > 0): + raise ValueError("Argument 'version' should be a non-empty string") + + xml_response = openml._api_calls._perform_api_call( + "flow/exists", + "post", + data={"name": name, "external_version": external_version}, + ) + result_dict = xmltodict.parse(xml_response) + flow_id = int(result_dict["oml:flow_exists"]["oml:id"]) + return flow_id if flow_id > 0 else False -def initialize_model(setup_id: int) -> Any: - """ - Initialized a model based on a setup_id (i.e., using the exact - same parameter settings) + +def get_flow_id( + model: Optional[Any] = None, + name: Optional[str] = None, + exact_version=True, +) -> Union[int, bool, List[int]]: + """Retrieves the flow id for a model or a flow name. + + Provide either a model or a name to this function. Depending on the input, it does + + * ``model`` and ``exact_version == True``: This helper function first queries for the necessary + extension. Second, it uses that extension to convert the model into a flow. Third, it + executes ``flow_exists`` to potentially obtain the flow id the flow is published to the + server. + * ``model`` and ``exact_version == False``: This helper function first queries for the + necessary extension. Second, it uses that extension to convert the model into a flow. Third + it calls ``list_flows`` and filters the returned values based on the flow name. + * ``name``: Ignores ``exact_version`` and calls ``list_flows``, then filters the returned + values based on the flow name. Parameters ---------- - setup_id : int - The Openml setup_id + model : object + Any model. Must provide either ``model`` or ``name``. + name : str + Name of the flow. Must provide either ``model`` or ``name``. + exact_version : bool + Whether to return the flow id of the exact version or all flow ids where the name + of the flow matches. This is only taken into account for a model where a version number + is available. Returns ------- - model + int or bool, List + flow id iff exists, ``False`` otherwise, List if ``exact_version is False`` """ - setup = get_setup(setup_id) - flow = openml.flows.get_flow(setup.flow_id) - - # instead of using scikit-learns or any other library's "set_params" function, we override the - # OpenMLFlow objects default parameter value so we can utilize the - # Extension.flow_to_model() function to reinitialize the flow with the set defaults. - for hyperparameter in setup.parameters.values(): - structure = flow.get_structure("flow_id") - if len(structure[hyperparameter.flow_id]) > 0: - subflow = flow.get_subflow(structure[hyperparameter.flow_id]) - else: - subflow = flow - subflow.parameters[hyperparameter.parameter_name] = hyperparameter.value + if model is None and name is None: + raise ValueError( + "Need to provide either argument `model` or argument `name`, but both are `None`." + ) + elif model is not None and name is not None: + raise ValueError("Must provide either argument `model` or argument `name`, but not both.") + + if model is not None: + extension = openml.extensions.get_extension_by_model(model, raise_if_no_extension=True) + if extension is None: + # This should never happen and is only here to please mypy will be gone soon once the + # whole function is removed + raise TypeError(extension) + flow = extension.model_to_flow(model) + flow_name = flow.name + external_version = flow.external_version + else: + flow_name = name + exact_version = False - model = flow.extension.flow_to_model(flow) - return model + if exact_version: + return flow_exists(name=flow_name, external_version=external_version) + else: + flows = list_flows(output_format="dataframe") + assert isinstance(flows, pd.DataFrame) # Make mypy happy + flows = flows.query('name == "{}"'.format(flow_name)) + return flows["id"].to_list() -def _to_dict(flow_id: int, openml_parameter_settings) -> OrderedDict: - """Convert a flow ID and a list of OpenML parameter settings to - a dictionary representation that can be serialized to XML. +def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]: + """Retrieve information about flows from OpenML API + and parse it to a dictionary or a Pandas DataFrame. Parameters ---------- - flow_id : int - ID of the flow. - openml_parameter_settings : List[OpenMLParameter] - A list of OpenML parameter settings. - + api_call: str + Retrieves the information about flows. + output_format: str in {"dict", "dataframe"} + The output format. Returns + ------- - OrderedDict - A dictionary representation of the flow ID and parameter settings. + The flows information in the specified output format. """ - # for convenience, this function (ab)uses the run object. - xml: OrderedDict = OrderedDict() - xml["oml:run"] = OrderedDict() - xml["oml:run"]["@xmlns:oml"] = "http://openml.org/openml" - xml["oml:run"]["oml:flow_id"] = flow_id - xml["oml:run"]["oml:parameter_setting"] = openml_parameter_settings + xml_string = openml._api_calls._perform_api_call(api_call, "get") + flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",)) - return xml + # Minimalistic check if the XML is useful + assert isinstance(flows_dict["oml:flows"]["oml:flow"], list), type(flows_dict["oml:flows"]) + assert flows_dict["oml:flows"]["@xmlns:oml"] == "http://openml.org/openml", flows_dict[ + "oml:flows" + ]["@xmlns:oml"] + + flows = dict() + for flow_ in flows_dict["oml:flows"]["oml:flow"]: + fid = int(flow_["oml:id"]) + flow = { + "id": fid, + "full_name": flow_["oml:full_name"], + "name": flow_["oml:name"], + "version": flow_["oml:version"], + "external_version": flow_["oml:external_version"], + "uploader": flow_["oml:uploader"], + } + flows[fid] = flow + if output_format == "dataframe": + flows = pd.DataFrame.from_dict(flows, orient="index") -def _create_setup_from_xml(result_dict, output_format="object"): - """ - Turns an API xml result into a OpenMLSetup object (or dict) + return flows + + +def _check_flow_for_server_id(flow: OpenMLFlow) -> None: + """Raises a ValueError if the flow or any of its subflows has no flow id.""" + + # Depth-first search to check if all components were uploaded to the + # server before parsing the parameters + stack = list() + stack.append(flow) + while len(stack) > 0: + current = stack.pop() + if current.flow_id is None: + raise ValueError("Flow %s has no flow_id!" % current.name) + else: + for component in current.components.values(): + stack.append(component) + + +def assert_flows_equal( + flow1: OpenMLFlow, + flow2: OpenMLFlow, + ignore_parameter_values_on_older_children: Optional[str] = None, + ignore_parameter_values: bool = False, + ignore_custom_name_if_none: bool = False, + check_description: bool = True, +) -> None: + """Check equality of two flows. + + Two flows are equal if their all keys which are not set by the server + are equal, as well as all their parameters and components. + + Parameters + ---------- + flow1 : OpenMLFlow + + flow2 : OpenMLFlow + + ignore_parameter_values_on_older_children : str (optional) + If set to ``OpenMLFlow.upload_date``, ignores parameters in a child + flow if it's upload date predates the upload date of the parent flow. + + ignore_parameter_values : bool + Whether to ignore parameter values when comparing flows. + + ignore_custom_name_if_none : bool + Whether to ignore the custom name field if either flow has `custom_name` equal to `None`. + + check_description : bool + Whether to ignore matching of flow descriptions. """ - setup_id = int(result_dict["oml:setup_parameters"]["oml:setup_id"]) - flow_id = int(result_dict["oml:setup_parameters"]["oml:flow_id"]) - parameters = {} - if "oml:parameter" not in result_dict["oml:setup_parameters"]: - parameters = None - else: - # basically all others - xml_parameters = result_dict["oml:setup_parameters"]["oml:parameter"] - if isinstance(xml_parameters, dict): - id = int(xml_parameters["oml:id"]) - parameters[id] = _create_setup_parameter_from_xml( - result_dict=xml_parameters, output_format=output_format - ) - elif isinstance(xml_parameters, list): - for xml_parameter in xml_parameters: - id = int(xml_parameter["oml:id"]) - parameters[id] = _create_setup_parameter_from_xml( - result_dict=xml_parameter, output_format=output_format + if not isinstance(flow1, OpenMLFlow): + raise TypeError("Argument 1 must be of type OpenMLFlow, but is %s" % type(flow1)) + + if not isinstance(flow2, OpenMLFlow): + raise TypeError("Argument 2 must be of type OpenMLFlow, but is %s" % type(flow2)) + + # TODO as they are actually now saved during publish, it might be good to + # check for the equality of these as well. + generated_by_the_server = [ + "flow_id", + "uploader", + "version", + "upload_date", + # Tags aren't directly created by the server, + # but the uploader has no control over them! + "tags", + ] + ignored_by_python_api = ["binary_url", "binary_format", "binary_md5", "model", "_entity_id"] + + for key in set(flow1.__dict__.keys()).union(flow2.__dict__.keys()): + if key in generated_by_the_server + ignored_by_python_api: + continue + attr1 = getattr(flow1, key, None) + attr2 = getattr(flow2, key, None) + if key == "components": + if not (isinstance(attr1, Dict) and isinstance(attr2, Dict)): + raise TypeError("Cannot compare components because they are not dictionary.") + + for name in set(attr1.keys()).union(attr2.keys()): + if name not in attr1: + raise ValueError( + "Component %s only available in " "argument2, but not in argument1." % name + ) + if name not in attr2: + raise ValueError( + "Component %s only available in " "argument2, but not in argument1." % name + ) + assert_flows_equal( + attr1[name], + attr2[name], + ignore_parameter_values_on_older_children, + ignore_parameter_values, + ignore_custom_name_if_none, ) + elif key == "_extension": + continue + elif check_description and key == "description": + # to ignore matching of descriptions since sklearn based flows may have + # altering docstrings and is not guaranteed to be consistent + continue else: - raise ValueError( - "Expected None, list or dict, received " - "something else: %s" % str(type(xml_parameters)) - ) + if key == "parameters": + if ignore_parameter_values or ignore_parameter_values_on_older_children: + params_flow_1 = set(flow1.parameters.keys()) + params_flow_2 = set(flow2.parameters.keys()) + symmetric_difference = params_flow_1 ^ params_flow_2 + if len(symmetric_difference) > 0: + raise ValueError( + "Flow %s: parameter set of flow " + "differs from the parameters stored " + "on the server." % flow1.name + ) + + if ignore_parameter_values_on_older_children: + upload_date_current_flow = dateutil.parser.parse(flow1.upload_date) + upload_date_parent_flow = dateutil.parser.parse( + ignore_parameter_values_on_older_children + ) + if upload_date_current_flow < upload_date_parent_flow: + continue + + if ignore_parameter_values: + # Continue needs to be done here as the first if + # statement triggers in both special cases + continue + elif ( + key == "custom_name" + and ignore_custom_name_if_none + and (attr1 is None or attr2 is None) + ): + # If specified, we allow `custom_name` inequality if one flow's name is None. + # Helps with backwards compatibility as `custom_name` is now auto-generated, but + # before it used to be `None`. + continue + elif key == "parameters_meta_info": + # this value is a dictionary where each key is a parameter name, containing another + # dictionary with keys specifying the parameter's 'description' and 'data_type' + # checking parameter descriptions can be ignored since that might change + # data type check can also be ignored if one of them is not defined, i.e., None + params1 = set(flow1.parameters_meta_info) + params2 = set(flow2.parameters_meta_info) + if params1 != params2: + raise ValueError( + "Parameter list in meta info for parameters differ " "in the two flows." + ) + # iterating over the parameter's meta info list + for param in params1: + if ( + isinstance(flow1.parameters_meta_info[param], Dict) + and isinstance(flow2.parameters_meta_info[param], Dict) + and "data_type" in flow1.parameters_meta_info[param] + and "data_type" in flow2.parameters_meta_info[param] + ): + value1 = flow1.parameters_meta_info[param]["data_type"] + value2 = flow2.parameters_meta_info[param]["data_type"] + else: + value1 = flow1.parameters_meta_info[param] + value2 = flow2.parameters_meta_info[param] + if value1 is None or value2 is None: + continue + elif value1 != value2: + raise ValueError( + "Flow {}: data type for parameter {} in {} differ " + "as {}\nvs\n{}".format(flow1.name, param, key, value1, value2) + ) + # the continue is to avoid the 'attr != attr2' check at end of function + continue + + if attr1 != attr2: + raise ValueError( + "Flow %s: values for attribute '%s' differ: " + "'%s'\nvs\n'%s'." % (str(flow1.name), str(key), str(attr1), str(attr2)) + ) + - if output_format in ["dataframe", "dict"]: - return_dict = {"setup_id": setup_id, "flow_id": flow_id} - return_dict["parameters"] = parameters - return return_dict - return OpenMLSetup(setup_id, flow_id, parameters) +def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow: + """Create flow object from xml + Parameters + ---------- + flow_xml: xml representation of a flow -def _create_setup_parameter_from_xml(result_dict, output_format="object"): + Returns + ------- + OpenMLFlow """ - Create an OpenMLParameter object or a dictionary from an API xml result. + + return OpenMLFlow._from_dict(xmltodict.parse(flow_xml)) + + +def delete_flow(flow_id: int) -> bool: + """Delete flow with id `flow_id` from the OpenML server. + + You can only delete flows which you uploaded and which + which are not linked to runs. + + Parameters + ---------- + flow_id : int + OpenML id of the flow + + Returns + ------- + bool + True if the deletion was successful. False otherwise. """ - if output_format == "object": - return OpenMLParameter( - input_id=int(result_dict["oml:id"]), - flow_id=int(result_dict["oml:flow_id"]), - flow_name=result_dict["oml:flow_name"], - full_name=result_dict["oml:full_name"], - parameter_name=result_dict["oml:parameter_name"], - data_type=result_dict["oml:data_type"], - default_value=result_dict["oml:default_value"], - value=result_dict["oml:value"], - ) - else: - return { - "input_id": int(result_dict["oml:id"]), - "flow_id": int(result_dict["oml:flow_id"]), - "flow_name": result_dict["oml:flow_name"], - "full_name": result_dict["oml:full_name"], - "parameter_name": result_dict["oml:parameter_name"], - "data_type": result_dict["oml:data_type"], - "default_value": result_dict["oml:default_value"], - "value": result_dict["oml:value"], - } + return openml.utils._delete_entity("flow", flow_id) From ab6f2028ebc7498ab9d8e1bd680e02c8b4845cfd Mon Sep 17 00:00:00 2001 From: Lennart Purucker Date: Mon, 30 Oct 2023 14:50:49 -0700 Subject: [PATCH 47/48] Update functions.py of setup to fix mypy error --- openml/setups/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 410a1e964..bc6d21aaa 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -287,7 +287,7 @@ def initialize_model(setup_id: int) -> Any: return model -def _to_dict(flow_id: int, openml_parameter_settings): +def _to_dict(flow_id: int, openml_parameter_settings) -> OrderedDict: """Convert a flow ID and a list of OpenML parameter settings to a dictionary representation that can be serialized to XML. @@ -304,7 +304,7 @@ def _to_dict(flow_id: int, openml_parameter_settings): A dictionary representation of the flow ID and parameter settings. """ # for convenience, this function (ab)uses the run object. - xml = OrderedDict() + xml: OrderedDict = OrderedDict() xml["oml:run"] = OrderedDict() xml["oml:run"]["@xmlns:oml"] = "http://openml.org/openml" xml["oml:run"]["oml:flow_id"] = flow_id From 049230ddb622194fde2c81e4de820a2e7179b799 Mon Sep 17 00:00:00 2001 From: Vishal Parmar Date: Wed, 1 Nov 2023 00:02:38 +0530 Subject: [PATCH 48/48] Update progress.rst --- doc/progress.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/progress.rst b/doc/progress.rst index 3fc493914..6fed41326 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -10,6 +10,7 @@ next ~~~~~~ * MAINT #1280: Use the server-provided ``parquet_url`` instead of ``minio_url`` to determine the location of the parquet file. + * ADD #716: add documentation for remaining attributes of classes and functions. 0.14.1 ~~~~~~