# Source code for elliot.namespace.namespace_model

"""
Module description:

Defines ``NameSpaceModel``, which loads a YAML experiment configuration,
resolves the paths it contains, exposes the experiment settings through a
``SimpleNamespace`` and builds hyperopt search spaces for the configured models.
"""

# Also used as the default value of the experiment's ``version`` entry.
__version__ = '0.3.1'
__author__ = 'Vito Walter Anelli, Claudio Pomo'
__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it'

import copy
import os
import re
from ast import literal_eval
from collections import OrderedDict
from functools import reduce
from os.path import isfile, join
from types import SimpleNamespace

from hyperopt import hp
from yaml import FullLoader as FullLoader
from yaml import load

import elliot.hyperoptimization as ho
from elliot.utils.folder import manage_directories

# Heuristic used by the path helpers to decide whether a relative string looks
# like a path: one non-digit character, one or more word/hyphen characters,
# a literal dot and one or more word/hyphen characters (e.g. ``name.ext``).
regexp = re.compile(r'[\D][\w-]+\.[\w-]+')

# Name of the root section of the configuration.
_experiment = 'experiment'

# Names of the entries looked up in the configuration. Not every constant is
# referenced by the code visible in this module.
_version = 'version'
_data_config = "data_config"
_splitting = "splitting"
_evaluation = "evaluation"
_prefiltering = "prefiltering"
_binarize = "binarize"
_negative_sampling = "negative_sampling"
_dataset = 'dataset'
_dataloader = 'dataloader'
_weights = 'path_output_rec_weight'
_performance = 'path_output_rec_performance'
_logger_config = 'path_logger_config'
_log_folder = 'path_log_folder'
_verbose = 'verbose'
_recs = 'path_output_rec_result'
_top_k = 'top_k'
_config_test = 'config_test'
_print_triplets = 'print_results_as_triplets'
_metrics = 'metrics'
_relevance_threshold = 'relevance_threshold'
_paired_ttest = 'paired_ttest'
_wilcoxon_test = 'wilcoxon_test'
_models = 'models'
_recommender = 'recommender'
_gpu = 'gpu'
_external_models_path = 'external_models_path'
# Entries read from a model's ``meta`` sub-section.
_hyper_max_evals = 'hyper_max_evals'
_hyper_opt_alg = 'hyper_opt_alg'
_data_paths = 'data_paths'
_meta = 'meta'
_random_seed = 'random_seed'
_align_side_with_train = "align_side_with_train"


class NameSpaceModel:
    def __init__(self, config_path, base_folder_path_elliot, base_folder_path_config):
        """Load the YAML experiment configuration and select the visible GPU.

        Args:
            config_path: path of the YAML configuration file to parse.
            base_folder_path_elliot: base folder used to locate the default
                logger configuration and log folder.
            base_folder_path_config: base folder against which relative paths
                found in the configuration are resolved.

        Raises:
            OSError: if the configuration file cannot be opened.
            KeyError: if the configuration has no ``experiment`` section.
        """
        self.base_namespace = SimpleNamespace()

        self._base_folder_path_elliot = base_folder_path_elliot
        self._base_folder_path_config = base_folder_path_config

        # Parse inside a context manager so the handle is released as soon as
        # the YAML has been read instead of staying open for the lifetime of
        # the object. The ``config_file`` attribute is kept (now referring to
        # the closed handle) for backward compatibility.
        # NOTE(review): the file is parsed with FullLoader; if configuration
        # files may come from untrusted sources, consider a safe loader.
        with open(config_path) as config_file:
            self.config_file = config_file
            self.config = load(config_file, Loader=FullLoader)

        # Expose the configured GPU id to CUDA; -1 is used when none is given.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(self.config[_experiment].get(_gpu, -1))

    @staticmethod
    def _set_path(config_path, local_path):
        if os.path.isabs(local_path):
            return os.path.abspath(local_path)
        else:
            if local_path.startswith((".", "..")) or regexp.search(local_path):
                # return f"{config_path}/{local_path}"
                return os.path.abspath(os.sep.join([config_path, local_path]))
            else:
                # the string is an attribute but not a path
                return local_path

    @staticmethod
    def _safe_set_path(config_path, raw_local_path, dataset_name):
        if isinstance(raw_local_path, str):
            local_path = raw_local_path.format(dataset_name)
            if os.path.isabs(local_path):
                return os.path.abspath(local_path)
            else:
                if local_path.startswith((".", "..")) or regexp.search(local_path):
                    return os.path.abspath(os.sep.join([config_path, local_path]))
                else:
                    # the string is an attribute but not a path
                    return local_path
        else:
            return raw_local_path

[docs]    def fill_base(self):

        # for path in self.config[_experiment][_data_paths].keys():
        #     self.config[_experiment][_data_paths][path] = \
        #         self.config[_experiment][_data_paths][path].format(self.config[_experiment][_dataset])
        default_results_recs = os.sep.join(["..", "results", "{0}", "recs"])
        default_results_weights = os.sep.join(["..", "results", "{0}", "weights"])
        default_results_performance = os.sep.join(["..", "results", "{0}", "performance"])
        self.config[_experiment][_recs] = os.path.abspath(self.config[_experiment]\
            .get(_recs, self._set_path(self._base_folder_path_config, default_results_recs))\
            .format(self.config[_experiment][_dataset]))
        self.config[_experiment][_weights] = os.path.abspath(self.config[_experiment]\
            .get(_weights, self._set_path(self._base_folder_path_config, default_results_weights)) \
            .format(self.config[_experiment][_dataset]))
        self.config[_experiment][_performance] = os.path.abspath(self.config[_experiment]\
            .get(_performance, self._set_path(self._base_folder_path_config, default_results_performance)) \
            .format(self.config[_experiment][_dataset]))

        self.config[_experiment][_dataloader] = self.config[_experiment].get(_dataloader, "DataSetLoader")
        self.config[_experiment][_version] = self.config[_experiment].get(_version, __version__)


        manage_directories(self.config[_experiment][_recs], self.config[_experiment][_weights],
                           self.config[_experiment][_performance])

        for p in [_data_config, _weights, _recs, _dataset, _top_k, _performance, _logger_config,
                  _log_folder, _dataloader, _splitting, _prefiltering, _evaluation, _external_models_path,
                  _print_triplets, _config_test, _negative_sampling, _binarize, _random_seed, _align_side_with_train,
                  _version]:
            if p == _data_config:
                side_information = self.config[_experiment][p].get("side_information", None)

                if side_information:
                    if isinstance(side_information, list):
                        side_information = [SimpleNamespace(**{k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                                                 for k, v in side.items()}) for side in side_information]
                        self.config[_experiment][p].update({k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                                                            for k, v in self.config[_experiment][p].items()})
                        self.config[_experiment][p]["side_information"] = side_information
                        self.config[_experiment][p][_dataloader] = "DataSetLoader"
                        setattr(self.base_namespace, p, SimpleNamespace(**self.config[_experiment][p]))
                    elif isinstance(side_information, dict):
                        side_information = self.config[_experiment][p].get("side_information", {})
                        side_information.update({k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                                                 for k, v in side_information.items()})
                        side_information = SimpleNamespace(**side_information)
                        self.config[_experiment][p].update({k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                                                            for k, v in self.config[_experiment][p].items()})
                        self.config[_experiment][p]["side_information"] = side_information
                        self.config[_experiment][p][_dataloader] = self.config[_experiment][p].get(_dataloader,
                                                                                                   "DataSetLoader")
                        setattr(self.base_namespace, p, SimpleNamespace(**self.config[_experiment][p]))
                    else:
                        raise Exception("Side information is neither a list nor a dict. No other options are allowed.")
                else:
                    self.config[_experiment][p]["side_information"] = []
                    self.config[_experiment][p][_dataloader] = self.config[_experiment][p].get(_dataloader,
                                                                                               "DataSetLoader")
                    self.config[_experiment][p].update(
                        {k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                         for k, v in self.config[_experiment][p].items()})
                    setattr(self.base_namespace, p, SimpleNamespace(**self.config[_experiment][p]))

            elif p == _splitting and self.config[_experiment].get(p, {}):
                self.config[_experiment][p].update({k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                                                    for k, v in self.config[_experiment][p].items()})
                test_splitting = self.config[_experiment][p].get("test_splitting", {})
                validation_splitting = self.config[_experiment][p].get("validation_splitting", {})

                if test_splitting:
                    test_splitting = SimpleNamespace(**test_splitting)
                    self.config[_experiment][p]["test_splitting"] = test_splitting

                if validation_splitting:
                    validation_splitting = SimpleNamespace(**validation_splitting)
                    self.config[_experiment][p]["validation_splitting"] = validation_splitting

                setattr(self.base_namespace, p, SimpleNamespace(**self.config[_experiment][p]))
            elif p == _prefiltering and self.config[_experiment].get(p, {}):

                if not isinstance(self.config[_experiment][p], list):
                    self.config[_experiment][p] = [self.config[_experiment][p]]

                preprocessing_strategies = [SimpleNamespace(**strategy) for strategy in self.config[_experiment][p]]
                self.config[_experiment][p] = preprocessing_strategies
                setattr(self.base_namespace, p, self.config[_experiment][p])

            elif p == _negative_sampling and self.config[_experiment].get(p, {}):
                self.config[_experiment][p].update({k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                                                    for k, v in self.config[_experiment][p].items()})
                self.config[_experiment][p] = SimpleNamespace(**self.config[_experiment][p])
                if getattr(self.config[_experiment][p], 'strategy', '') == 'random':
                    path = os.path.abspath(os.sep.join([self._base_folder_path_config, "..", "data",
                                                         self.config[_experiment][_dataset], "negative.tsv"]))
                    setattr(self.config[_experiment][p], 'file_path', path)
                setattr(self.base_namespace, p, self.config[_experiment][p])
            elif p == _evaluation and self.config[_experiment].get(p, {}):
                complex_metrics = self.config[_experiment][p].get("complex_metrics", {})
                paired_ttest = self.config[_experiment][p].get("paired_ttest", {})
                wilcoxon_test = self.config[_experiment][p].get("wilcoxon_test", {})
                for complex_metric in complex_metrics:
                    complex_metric.update({k: self._safe_set_path(self._base_folder_path_config, v, self.config[_experiment][_dataset])
                                           for k, v in complex_metric.items()})
                    # complex_metric.update({k: self._set_path(self._base_folder_path_config,
                    #                                   v.format(self.config[_experiment][_dataset]))
                    #                 for k, v in complex_metric.items() if isinstance(v, str)})
                self.config[_experiment][p]["complex_metrics"] = complex_metrics
                self.config[_experiment][p]["paired_ttest"] = paired_ttest
                self.config[_experiment][p]["wilcoxon_test"] = wilcoxon_test
                setattr(self.base_namespace, p, SimpleNamespace(**self.config[_experiment][p]))
            elif p == _logger_config:
                if not self.config[_experiment].get(p, False):
                    setattr(self.base_namespace, p, os.path.abspath(os.sep.join([self._base_folder_path_elliot, "config", "logger_config.yml"])))
                else:
                    setattr(self.base_namespace, p,
                            self._safe_set_path(self._base_folder_path_config, self.config[_experiment][p], self.config[_experiment][_dataset]))
                # setattr(self.base_namespace, p, f"{self._base_folder_path_elliot}/config/logger_config.yml")
            elif p == _log_folder:
                if not self.config[_experiment].get(p, False):
                    setattr(self.base_namespace, p, os.path.abspath(os.sep.join([self._base_folder_path_elliot, "..", "log"])))
                else:
                    setattr(self.base_namespace, p,
                            self._safe_set_path(self._base_folder_path_config, self.config[_experiment][p], self.config[_experiment][_dataset]))

                # setattr(self.base_namespace, p, f"{self._base_folder_path_elliot}/../log/")
            elif p == _external_models_path and self.config[_experiment].get(p, False):
                self.config[_experiment][p] = self._safe_set_path(self._base_folder_path_config, self.config[_experiment][p], "")
                setattr(self.base_namespace, p, self.config[_experiment][p])
            elif p == _config_test:
                setattr(self.base_namespace, p, self.config[_experiment].get(p, False))
            elif p == _random_seed:
                setattr(self.base_namespace, p, self.config[_experiment].get(p, 42))
            elif p == _binarize:
                setattr(self.base_namespace, p, self.config[_experiment].get(p, False))
            elif p == _align_side_with_train:
                setattr(self.base_namespace, p, self.config[_experiment].get(p, True))
            else:
                if self.config[_experiment].get(p):
                    setattr(self.base_namespace, p, self.config[_experiment][p])

[docs]    def fill_model(self):
        for key in self.config[_experiment][_models]:
            meta_model = self.config[_experiment][_models][key].get(_meta, {})
            model_name_space = SimpleNamespace(**self.config[_experiment][_models][key])
            setattr(model_name_space, _meta, SimpleNamespace(**meta_model))
            if any(isinstance(value, list) for value in self.config[_experiment][_models][key].values()):
                space_list = []
                for k, value in self.config[_experiment][_models][key].items():
                    if isinstance(value, list):
                        valid_functions = ["choice",
                             "randint",
                             "uniform",
                             "quniform",
                             "loguniform",
                             "qloguniform",
                             "normal",
                             "qnormal",
                             "lognormal",
                             "qlognormal"
                             ]
                        if isinstance(value[0], str) and value[0] in valid_functions:
                            func_ = getattr(hp, value[0].replace(" ","").split("(")[0])
                            val_string = value[0].replace(" ", "").split("(")[1].split(")")[0] \
                                if len(value[0].replace(" ", "").split("(")) > 1 else None
                            val = [literal_eval(val_string) if val_string else None]
                            val.extend([literal_eval(val.replace(" ", "").replace(")", "")) if isinstance(val, str) else
                                        val for val in value[1:]])
                            val = [v for v in val if v is not None]
                            space_list.append((k, func_(k, *val)))
                        elif all(isinstance(item, str) for item in value):
                            space_list.append((k, hp.choice(k, value)))
                        else:
                            space_list.append((k, hp.choice(k, literal_eval(
                                "[" + str(",".join([str(v) for v in value])) + "]")
                                                            )))
                _SPACE = OrderedDict(space_list)
                _estimated_evals = reduce(lambda x, y: x*y, [len(param.pos_args) - 1 for _, param in _SPACE.items()], 1)
                _max_evals = meta_model.get(_hyper_max_evals, _estimated_evals)
                if _max_evals <= 0:
                    raise Exception("Only pure value lists can be used without hyper_max_evals option. Please define hyper_max_evals in model/meta configuration.")
                _opt_alg = ho.parse_algorithms(meta_model.get(_hyper_opt_alg, "grid"))
                yield key, (model_name_space, _SPACE, _max_evals, _opt_alg)
            else:
                if key == "RecommendationFolder":
                    folder_path = getattr(model_name_space, "folder", None)
                    if folder_path:
                        onlyfiles = [f for f in os.listdir(folder_path) if isfile(join(folder_path, f))]
                        for file_ in onlyfiles:
                            local_model_name_space = copy.copy(model_name_space)
                            local_model_name_space.path = os.path.join(folder_path, file_)
                            yield "ProxyRecommender", local_model_name_space
                    else:
                        raise Exception("RecommendationFolder meta-model must expose the folder field.")
                else:
                    yield key, model_name_space