Reference

Main method

SolrCMF

Bases: ADMM

Sparse orthogonal low-rank Collective Matrix Factorization

Implements sparse orthogonal low-rank Collective Matrix Factorization (solrCMF).

Source code in solrcmf/solrcmf.py

class SolrCMF(ADMM):
    """Sparse orthogonal low-rank Collective Matrix Factorization (solrCMF).

    The data are passed as a dictionary of matrices. The first two entries
    of each key name the row view and the column view of that matrix; any
    further entries only serve to keep keys distinct when the same pair of
    views appears more than once.

    Parameters
    ----------
    structure_penalty, max_rank
        Must be given together. They select penalized estimation of the
        structure; alternatively both are left as ``None`` and a
        ``structure_pattern`` is supplied to the setup instead.
    factor_penalty
        Enables the factor sparsity penalty when not ``None``. Mutually
        exclusive with a fixed ``factor_pattern``.
    factor_pruning
        Stored in the context parameters; must be ``False`` when a fixed
        structure or factor pattern is used.
    init, init_kwargs
        ``"random"`` uses ``RandomInitializer(**init_kwargs)``; ``"custom"``
        uses ``FromFormerInitializer`` with user-supplied ``vs``/``ds``/``us``.
    rho
        Must be strictly greater than the bound returned by
        ``_rho_lower_bound``; defaults to that bound plus ``0.1``.
    alpha
        Defaults to ``1e-3 * rho``.
    mu
        Only used with a factor penalty or factor pattern; defaults to
        ``10.0`` in that case.
    max_iter, abs_tol, rel_tol, save_ctx
        Forwarded unchanged to ``ADMM.__init__``.
    """

    _parameter_constraints = {
        **ADMM._parameter_constraints,
        "structure_penalty": [Interval(Real, 0, None, closed="left"), None],
        "max_rank": [Interval(Integral, 1, None, closed="left"), None],
        "factor_penalty": [Interval(Real, 0, None, closed="neither"), None],
        "factor_pruning": ["boolean"],
        "init": [StrOptions({"random", "custom"})],
        "init_kwargs": [dict, None],
        "rho": [Interval(Real, 0.0, None, closed="neither"), None],
        "alpha": [Interval(Real, 0.0, None, closed="left"), None],
        "mu": [Interval(Real, 0.0, None, closed="neither"), None],
    }

    def __init__(
        self,
        *,
        structure_penalty: float | None = None,
        max_rank: int | None = None,
        factor_penalty: float | None = None,
        factor_pruning: bool = True,
        init: str = "random",
        init_kwargs: dict[str, Any] | None = None,
        rho: float | None = None,
        alpha: float | None = None,
        mu: float | None = None,
        max_iter: int = 1000,
        abs_tol: float = 1e-6,
        rel_tol: float = 1e-6,
        save_ctx: bool = False,
    ):
        """Store the hyperparameters; no validation happens here."""
        super().__init__(
            max_iter=max_iter,
            abs_tol=abs_tol,
            rel_tol=rel_tol,
            save_ctx=save_ctx,
        )

        self.structure_penalty = structure_penalty
        self.max_rank = max_rank
        self.factor_penalty = factor_penalty
        self.factor_pruning = factor_pruning
        self.init = init
        self.init_kwargs = init_kwargs
        self.rho = rho
        self.alpha = alpha
        self.mu = mu

    def _setup(
        self,
        X: dict[ViewDesc, NDArray[float64]],
        *,
        indices: dict[ViewDesc, NDArray[intp]] | None = None,
        structure_weights: (
            dict[ViewDesc, NDArray[float64] | float64] | None
        ) = None,
        structure_pattern: dict[ViewDesc, NDArray[bool_]] | None = None,
        factor_weights: dict[Hashable, NDArray[float64] | float64]
        | None = None,
        factor_pattern: dict[Hashable, NDArray[bool_]] | None = None,
        vs: dict[Hashable, NDArray[float64]] | None = None,
        ds: dict[ViewDesc, NDArray[float64]] | None = None,
        us: dict[Hashable, NDArray[float64]] | None = None,
    ):
        """Validate the inputs and build the ADMM context for one fit.

        Parameters
        ----------
        X
            Dictionary of data matrices; NaN entries are allowed and are
            excluded from ``flat_indices``.
        indices
            Optional flat indices per matrix restricting which entries are
            used. Without it, all non-NaN entries are used.
        structure_weights
            Optional weights per matrix; defaults to ``1.0`` for each. Only
            used with ``structure_penalty``.
        structure_pattern
            Fixed boolean pattern per matrix, all of the same length (which
            becomes the maximum rank). Requires ``structure_penalty`` and
            ``max_rank`` to be ``None`` and ``factor_pruning`` to be False.
        factor_weights
            Optional weights per view; defaults to ``1 / sqrt(dim)``. Only
            used with ``factor_penalty``.
        factor_pattern
            Fixed boolean pattern per view with shape
            ``(view dimension, max rank)``. Requires ``factor_penalty`` to
            be ``None`` and ``factor_pruning`` to be False.
        vs, ds, us
            Initial values, required when ``init == "custom"`` (``us`` only
            when a factor penalty or factor pattern is used).

        Returns
        -------
        Context
            The populated context (data, parameters, blocks, constraints),
            already initialized by the chosen initializer.

        Raises
        ------
        AssertionError
            If the arguments are inconsistent with each other or the data.
        ValueError
            If ``self.init`` is not a known initialization method.
        """
        # A context will be populated throughout setup
        ctx = Context()

        assert isinstance(
            X, dict
        ), "'X' needs to be a dictionary of data matrices"

        # Validation only: the converted result is intentionally discarded
        # and the caller's arrays are stored in the context as they are.
        for x in X.values():
            check_array(x, force_all_finite="allow-nan")

        layout = X.keys()
        # The first two tuple indices indicate the views.
        # The rest are arbitrary to make the indices different if the
        # same views appear.
        views = {k[i] for k in layout for i in range(2)}
        viewdims_set = {
            (k[i], x.shape[i]) for k, x in X.items() for i in range(2)
        }
        # A view seen with two different sizes yields two (view, dim) pairs
        # and therefore more pairs than views.
        assert len(views) == len(viewdims_set), (
            "Views do not have consistent dimensions across layout. Received"
            f" matrices with dimensions {viewdims_set}."
        )
        viewdims = dict(viewdims_set)

        ctx.data = X

        # Exactly one complete specification of the structure is allowed:
        # (structure_penalty AND max_rank) or structure_pattern. Partial
        # specifications (e.g. a penalty without a rank) are rejected here
        # instead of failing later with an unrelated error.
        penalized_structure = (
            self.structure_penalty is not None
            and self.max_rank is not None
            and structure_pattern is None
        )
        fixed_structure = (
            self.structure_penalty is None
            and self.max_rank is None
            and structure_pattern is not None
        )
        assert penalized_structure or fixed_structure, (
            "Either both structure_penalty and max_rank, or "
            " structure_pattern need to be provided. The respective"
            " other(s) need to be None."
        )

        assert self.factor_penalty is None or factor_pattern is None, (
            "One or both of `factor_penalty` and `factor_pattern`"
            " need to be None."
        )

        if penalized_structure:
            ctx.params["structure_penalty"] = self.structure_penalty
            max_rank = self.max_rank

            if structure_weights is None:
                ctx.params["structure_weights"] = {k: 1.0 for k in layout}
            else:
                # TODO: Add argument check
                ctx.params["structure_weights"] = structure_weights

            ctx.params["fixed_structure_pattern"] = False
        else:
            assert (
                not self.factor_pruning
            ), "Set 'factor_pruning' to False to use 'structure_pattern'"
            assert structure_pattern.keys() == X.keys(), (
                "'structure_pattern' must contain one pattern for each data"
                f" matrix. Expected: {X.keys()}, observed:"
                f" {structure_pattern.keys()}"
            )

            rks = set(p.shape[0] for p in structure_pattern.values())
            assert len(rks) == 1, (
                "All patterns in 'structure_pattern' should have the same"
                f" length. Observed lengths: {rks}"
            )
            # Extract the only element
            max_rank = list(rks)[0]

            ctx.params["structure_pattern"] = structure_pattern
            ctx.params["fixed_structure_pattern"] = True

        ctx.params["max_rank"] = max_rank

        for v, p in viewdims.items():
            assert p >= max_rank, (
                f"View {v} has dimension {p} which is less than the maximum"
                f" requested rank {max_rank}"
            )

        # mu is only stored when a factor penalty or a factor pattern is in
        # use; both code paths share the same default.
        mu = 10.0 if self.mu is None else self.mu

        if self.factor_penalty is not None:
            ctx.params["mu"] = mu
            ctx.params["factor_penalty"] = self.factor_penalty
            ctx.params["factor_sparsity"] = True

            if factor_weights is None:
                ctx.params["factor_weights"] = {
                    k: 1.0 / sqrt(p) for k, p in viewdims.items()
                }
            else:
                # TODO: Add argument check
                ctx.params["factor_weights"] = factor_weights
        else:
            ctx.params["factor_sparsity"] = False

        if factor_pattern is not None:
            assert (
                not self.factor_pruning
            ), "Set 'factor_pruning' to False to use 'factor_pattern'"

            ctx.params["mu"] = mu

            # Check factor pattern's correctness
            assert factor_pattern.keys() == viewdims.keys(), (
                "'factor_pattern' needs to contain a pattern for each view."
                f" Views = {viewdims.keys()}, Patterns available for views ="
                f" {factor_pattern.keys()}"
            )
            dims = {k: p.shape[0] for k, p in factor_pattern.items()}
            assert dims == viewdims, (
                f"View dimensions in 'factor_pattern' ({dims}) do not agree"
                f" with view dimensions in data ({viewdims})."
            )
            rks = set(p.shape[1] for p in factor_pattern.values())
            assert len(rks) == 1, (
                "The patterns in 'factor_pattern' need to have the same"
                f" number of columns. Observed sizes = {rks}"
            )
            # Extract the only element
            rk = list(rks)[0]
            assert rk == max_rank, (
                "Number of columns in 'factor_pattern' needs to match"
                " 'max_rank' or number of elements in each"
                f" 'structure_pattern'. Expected: {max_rank}, observed:"
                f" {rk}"
            )

            ctx.params["factor_pattern"] = factor_pattern
            ctx.params["fixed_factor_pattern"] = True
        else:
            ctx.params["fixed_factor_pattern"] = False

        if ctx.params["factor_sparsity"] or ctx.params["fixed_factor_pattern"]:
            ctx.params["vp_weights"] = {
                k: 1.0 / sqrt(p) for k, p in viewdims.items()
            }
            max_vp_w = max(ctx.params["vp_weights"].values())
            min_vp_w = min(ctx.params["vp_weights"].values())

            rho_lb = _rho_lower_bound(ctx.params["mu"], min_vp_w, max_vp_w)
        else:
            rho_lb = _rho_lower_bound()

        # Default rho sits just above the lower bound.
        rho = rho_lb + 0.1 if self.rho is None else self.rho

        assert rho > rho_lb, (
            f"rho needs to be greater than {rho_lb} in"
            f" {self.__class__.__name__}; now it is {rho}"
        )

        if ctx.params["factor_sparsity"]:
            u_edge_cases = {
                k: self.factor_penalty * w * sqrt(viewdims[k]) / rho
                for k, w in ctx.params["factor_weights"].items()
            }
            if any([u >= 1 for u in u_edge_cases.values()]):
                warn(
                    "For numerical stability, factor_penalty * weight[k] *"
                    " sqrt(viewdims[k]) / rho < 1 should hold for all views k."
                    f" Here: {u_edge_cases}"
                )

        ctx.params["rho"] = rho
        if self.alpha is None:
            ctx.params["alpha"] = 1e-3 * ctx.params["rho"]
        else:
            ctx.params["alpha"] = self.alpha

        ctx.params["factor_pruning"] = self.factor_pruning

        # For every view v: the matrices in which v is the column view
        # (paired with their row-view block key) and those in which v is
        # the row view (paired with their column-view block key).
        ctx.params["vidx_ridx"] = {
            (v,): [(k, (k[0],)) for k in layout if k[1] == v] for v in views
        }
        ctx.params["vidx_cidx"] = {
            (v,): [(k, (k[1],)) for k in layout if k[0] == v] for v in views
        }

        # Set up ADMM blocks and constraints
        for v in views:
            ctx.add_block("v", (v,), VBlock, (viewdims[v], max_rank))
        for k in layout:
            ctx.add_block("d", k, DBlock, (max_rank,))

            if self.factor_pruning:
                ctx.blocks["d"][k].active_factors = ones(
                    (max_rank,), dtype=bool_
                )

        if self.factor_penalty is not None or factor_pattern is not None:
            for v in views:
                ctx.add_block("u", (v,), UBlock, (viewdims[v], max_rank))

            # Important to keep vp blocks just before z blocks
            for v in views:
                ctx.add_block("vp", (v,), VpBlock, (viewdims[v], max_rank))
                ctx.add_constraint(
                    "factor",
                    (v,),
                    FactorConstraint,
                    (viewdims[v], max_rank),
                )

        # Important to keep z blocks last
        for k in layout:
            ctx.add_block("z", k, ZBlock, (viewdims[k[0]], viewdims[k[1]]))
            ctx.add_constraint(
                "mean_structure",
                k,
                MeanStructureConstraint,
                (viewdims[k[0]], viewdims[k[1]]),
            )

        init_kwargs = {} if self.init_kwargs is None else self.init_kwargs

        # Initialize blocks and constraints
        if self.init == "random":
            init_fn = RandomInitializer(**init_kwargs)
        elif self.init == "custom":
            assert vs is not None and ds is not None, (
                f"If 'init' is \"custom\" in {self.__class__.__name__} then"
                " 'vs' and 'ds' need to be provided to method 'fit' as"
                " keyword arguments."
            )
            if (
                ctx.params["factor_sparsity"]
                or ctx.params["fixed_factor_pattern"]
            ):
                assert us is not None, (
                    "If 'init' is \"custom\" in"
                    f" {self.__class__.__name__} then 'us' needs to be"
                    " provided to method 'fit' as a keyword argument."
                )

            init_fn = FromFormerInitializer(
                vs=vs,
                ds=ds,
                us=us,
                **init_kwargs,
            )
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError(f"Unknown init method {self.init}")

        # Call initializer
        init_fn(ctx)

        # Remove nan entries from `flat_indices` if indices provided.
        # Otherwise have non-nan indices as `flat_indices`
        ctx.params["flat_indices"] = {}
        for k, x in ctx.data.items():
            if indices is None:
                ctx.params["flat_indices"][k] = flatnonzero(
                    logical_not(isnan(x))
                )
            else:
                indices_mask = zeros(x.size, dtype=bool_)
                indices_mask[indices[k]] = True
                not_nan_mask = logical_not(isnan(x)).ravel()
                ctx.params["flat_indices"][k] = flatnonzero(
                    logical_and(indices_mask, not_nan_mask)
                )

        return ctx

    def transform(
        self,
        X: dict[ViewDesc, NDArray[float64]],
        y=None,
    ):
        """Return the fitted reconstruction of every matrix.

        For each key ``k`` in ``ds_`` the result is
        ``vs_[k[0]] @ diag(ds_[k]) @ vs_[k[1]].T``. The arguments ``X`` and
        ``y`` are not used in the computation.
        """
        check_is_fitted(self)

        return {
            k: self.vs_[k[0]] @ diag(d) @ self.vs_[k[1]].T
            for k, d in self.ds_.items()
        }

    def score(
        self,
        X: dict[ViewDesc, NDArray[float64]],
        *,
        indices: dict[ViewDesc, NDArray[intp]] | None = None,
    ):
        """Return ``neg_mean_squared_error`` of the reconstruction vs ``X``.

        ``indices`` is forwarded unchanged to ``neg_mean_squared_error``.
        """
        check_is_fitted(self)

        return neg_mean_squared_error(X, self.transform(X), indices=indices)

    def structure_pattern(self):
        """Return, per data matrix, which entries of ``ds_`` are non-zero."""
        check_is_fitted(self)

        return {k: d != 0.0 for k, d in self.ds_.items()}

    def factor_pattern(self):
        """Return, per view, which entries of ``us_`` are non-zero.

        Returns ``None`` when the fitted model has no ``us_`` attribute,
        i.e. when neither a factor penalty nor a factor pattern was used.
        """
        check_is_fitted(self)

        if hasattr(self, "us_"):
            return {k: u != 0.0 for k, u in self.us_.items()}

        return None

    def _extra_attrs(self, ctx: Context):
        """Collect the fitted attributes from the final context.

        Returns a dict with ``vs_`` and ``ds_``, ``us_`` when "u" blocks
        exist, and ``est_max_rank_``: the number of factor positions whose
        entry in ``ds_`` is non-zero for at least one data matrix.
        """
        out = {}
        # Block keys for "v" and "u" are 1-tuples ``(view,)``; unwrap them.
        out["vs_"] = {k[0]: b.value for k, b in ctx.blocks["v"].items()}
        out["ds_"] = {k: b.value for k, b in ctx.blocks["d"].items()}
        if ctx.params["factor_sparsity"] or ctx.params["fixed_factor_pattern"]:
            out["us_"] = {k[0]: b.value for k, b in ctx.blocks["u"].items()}

        out["est_max_rank_"] = sum(
            vstack([d != 0.0 for d in out["ds_"].values()]).sum(0) != 0
        )

        return out

    def _more_tags(self):
        """Declare that this estimator takes a dictionary as ``X``."""
        return {
            "X_types": "dict",
        }

Hyperparameter selection

ElementwiseFolds

Bases: BaseSplitter

Source code in solrcmf/splits.py

class ElementwiseFolds(BaseSplitter):
    """Split the observed (non-NaN) entries of each matrix into folds.

    Every matrix in the input dictionary is partitioned independently, by
    flat index, into ``n_splits`` folds whose sizes differ by at most one.
    """

    def __init__(
        self,
        n_splits: int,
        *,
        shuffle: bool = True,
        rng: Generator | None = None,
    ):
        """Validate and store the splitter configuration.

        Raises ``ValueError`` if ``n_splits`` is below 2, or if an ``rng``
        is passed although ``shuffle`` is ``False``.
        """
        if n_splits <= 1:
            raise ValueError("n_splits needs to be an integer >= 2")
        if shuffle is False and rng is not None:
            raise ValueError("rng should be None if shuffle is False")

        self.n_splits = n_splits
        self.shuffle = shuffle
        # Fall back to a freshly seeded generator when none is supplied.
        self.rng = default_rng() if rng is None else rng

    # Take some inspiration from sklearn
    def _iter_test_indices(self, xs: dict[ViewDesc, NDArray[float64]]):
        """Yield, fold by fold, a dict of test indices for every matrix."""
        # Flat positions of the entries that are not already NaN
        observed = {}
        for key, mat in xs.items():
            keep = flatnonzero(logical_not(isnan(mat)))
            observed[key] = arange(mat.size)[keep]

        if self.shuffle:
            for positions in observed.values():
                self.rng.shuffle(positions)

        # Even split; the first `remainder` folds take one extra entry
        sizes = {}
        for key, positions in observed.items():
            base, remainder = divmod(positions.size, self.n_splits)
            per_fold = full(self.n_splits, base, dtype=int_)
            per_fold[:remainder] += 1
            sizes[key] = per_fold

        starts = {key: 0 for key in sizes.keys()}
        for fold in range(self.n_splits):
            stops = {
                key: begin + sizes[key][fold] for key, begin in starts.items()
            }
            yield {
                key: positions[starts[key] : stops[key]]
                for key, positions in observed.items()
            }
            # The end of this fold is the start of the next one
            starts = stops

    def get_n_splits(self, xs: dict[ViewDesc, NDArray[float64]]):
        """Return the configured number of folds; ``xs`` is not inspected."""
        return self.n_splits

SolrCMFCV

Bases: BaseEstimator

Source code in solrcmf/crossval.py

class SolrCMFCV(BaseEstimator):
    # Allowed types/ranges for each constructor parameter. `fit` calls
    # `self._validate_params()` first, which presumably checks the stored
    # attributes against this table (scikit-learn convention; confirm
    # against the BaseEstimator in use). The three grid parameters
    # (`structure_penalty`, `max_rank`, `factor_penalty`) accept either a
    # single number or an array-like of candidate values.
    _parameter_constraints = {
        "structure_penalty": [
            Interval(Real, 0, None, closed="left"),
            "array-like",
        ],
        "max_rank": [Interval(Integral, 1, None, closed="left"), "array-like"],
        "factor_penalty": [
            Interval(Real, 0, None, closed="neither"),
            "array-like",
            None,
        ],
        "factor_pruning": ["boolean"],
        "cv": [Interval(Integral, 2, None, closed="left"), BaseSplitter],
        "cv_strategy": [
            StrOptions({"structure_first_debiased_cv", "penalized_cv"})
        ],
        "score": [
            StrOptions(
                {
                    "neg_mean_squared_error",
                    "neg_sum_squared_error",
                    "weighted_neg_mean_squared_error",
                }
            )
        ],
        "refit": [
            StrOptions(
                {
                    "mean_debiased",
                    "mean_penalized",
                    "1se_debiased",
                    "1se_penalized",
                }
            )
        ],
        "init": [StrOptions({"random", "custom"})],
        "init_kwargs": [dict, None],
        "rho": [Interval(Real, 0.0, None, closed="neither"), None],
        "alpha": [Interval(Real, 0.0, None, closed="left"), None],
        "mu": [Interval(Real, 0.0, None, closed="neither"), None],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "abs_tol": [Interval(Real, 0.0, None, closed="neither")],
        "rel_tol": [Interval(Real, 0.0, None, closed="neither")],
        "verbose": ["boolean"],
        "n_jobs": [Integral, None],
    }

    def __init__(
        self,
        *,
        structure_penalty: float | ArrayLike = 1.0,
        max_rank: int | ArrayLike = 10,
        factor_penalty: float | ArrayLike | None = None,
        factor_pruning: bool = True,
        cv: int | BaseSplitter = 10,
        cv_strategy: str = "structure_first_debiased_cv",
        score: str = "neg_mean_squared_error",
        refit: str = "1se_debiased",
        init: str = "random",
        init_kwargs: dict | None = None,
        rho: float | None = None,
        alpha: float | None = None,
        mu: float | None = None,
        max_iter: int = 1000,
        abs_tol: float = 1e-6,
        rel_tol: float = 1e-6,
        verbose: bool = False,
        n_jobs: int | None = None,
    ):
        """Record every argument verbatim as an attribute of the same name.

        Nothing is validated or converted here; the values are only stored.
        """
        # Hyperparameter grid (scalars or array-likes of candidates)
        self.structure_penalty, self.max_rank, self.factor_penalty = (
            structure_penalty,
            max_rank,
            factor_penalty,
        )
        self.factor_pruning = factor_pruning

        # Cross-validation and model-selection settings
        self.cv, self.cv_strategy = cv, cv_strategy
        self.score, self.refit = score, refit

        # Initialization of the underlying estimator
        self.init, self.init_kwargs = init, init_kwargs

        # Solver settings passed on to the underlying estimator
        self.rho, self.alpha, self.mu = rho, alpha, mu
        self.max_iter, self.abs_tol, self.rel_tol = max_iter, abs_tol, rel_tol

        # Execution settings
        self.verbose, self.n_jobs = verbose, n_jobs

    def _check_parameter_grid(self):
        """Build the list of hyperparameter combinations to evaluate.

        ``structure_penalty``, ``max_rank`` and ``factor_penalty`` are each
        promoted to at least one dimension, required to be exactly
        one-dimensional, and broadcast against each other. The result is a
        list of ``(structure_penalty, max_rank, factor_penalty)`` tuples,
        one per position of the broadcast arrays.

        Raises ``AssertionError`` if any of the three has more than one
        dimension.
        """
        # Promote scalars (and None) to 1d-arrays
        grids = atleast_1d(
            self.structure_penalty, self.max_rank, self.factor_penalty
        )
        penalties, ranks, factor_penalties = grids

        # Anything that is still not 1d was passed with too many dimensions
        all_one_dimensional = all(
            ndim(grid) == 1 for grid in (penalties, ranks, factor_penalties)
        )
        assert all_one_dimensional, (
            f"In {self.__class__.__name__} arguments 'structure_penalty',"
            " 'max_rank', and 'factor_penalty' need to be one-dimensional or"
            " equal to a single number (or 'None' for 'factor_penalty')"
        )

        # Length-1 grids are stretched to match the longer ones
        aligned = broadcast_arrays(penalties, ranks, factor_penalties)

        return list(zip(*aligned))

    def fit(
        self,
        X: dict[ViewDesc, NDArray[float64]],
        y=None,
        *,
        structure_weights: (
            dict[ViewDesc, NDArray[float64] | float64] | None
        ) = None,
        factor_weights: dict[Hashable, NDArray[float64] | float64]
        | None = None,
        vs: list[dict[Hashable, NDArray[float64]]] | None = None,
        ds: list[dict[ViewDesc, NDArray[float64]]] | None = None,
        us: list[dict[Hashable, NDArray[float64]]] | None = None,
    ):
        self._validate_params()

        parameter_grid = self._check_parameter_grid()

        n_params = len(parameter_grid)

        if isinstance(self.cv, Integral):
            cv = ElementwiseFolds(self.cv)
        elif isinstance(self.cv, BaseSplitter):
            cv = self.cv

        if self.score == "neg_mean_squared_error":
            score_fn = neg_mean_squared_error
        elif self.score == "neg_sum_squared_error":
            score_fn = neg_sum_squared_error
        elif self.score == "weighted_neg_mean_squared_error":
            score_fn = weighted_neg_mean_squared_error

        results = {
            "structure_penalty": [s for s, _, _ in parameter_grid],
            "max_rank": [m for _, m, _ in parameter_grid],
            "factor_penalty": [f for _, _, f in parameter_grid],
        }

        if self.init_kwargs is None:
            init_kwargs = {}
        else:
            init_kwargs = self.init_kwargs

        # If one of these is provided all need to be the same length
        # (if only vs and ds are provided then us is a list of None)
        if vs is not None or ds is not None or us is not None:
            assert (
                vs is not None and ds is not None and len(vs) == len(ds) >= 1
            ), (
                "If initial values are provided to"
                f" {self.__class__.__name__}.fit(), then 'vs' and 'ds' both"
                " need to provided and have to be the same length"
            )

            assert us is None or len(us) == len(vs), (
                "If initial values for 'u' are provided to"
                f" {self.__class__.__name__}.fit(), then 'us' needs have the"
                " same length as 'vs' and 'ds'"
            )

        if self.init == "random":
            n_reps = 1
            if "repetitions" in init_kwargs:
                n_reps = init_kwargs.pop("repetitions")

            def inits():
                for i in range(n_reps):
                    yield i, (None, None, None)

            # If an rng or seed is supplied, extract it
            if "rng" in init_kwargs:
                rng = default_rng(init_kwargs["rng"])
            else:
                rng = default_rng()

        elif self.init == "custom":
            n_reps = len(vs)

            def inits():
                for i in range(n_reps):
                    yield i, (vs[i], ds[i], us[i] if us is not None else None)

        else:
            raise ValueError(f"Unknown init method {self.init}")

        base_est = SolrCMF(
            factor_pruning=self.factor_pruning,
            init=self.init,
            init_kwargs=init_kwargs,
            rho=self.rho,
            alpha=self.alpha,
            mu=self.mu,
            max_iter=self.max_iter,
            abs_tol=self.abs_tol,
            rel_tol=self.rel_tol,
        )

        if self.cv_strategy == "structure_first_debiased_cv":
            tmpdir = TemporaryDirectory()
            tmppath = Path(tmpdir.name)

            def _estimate_structure(
                idx_params,
                idx_init,
                structure_penalty,
                max_rank,
                factor_penalty,
                vs,
                ds,
                us,
                rng,
            ):
                est: SolrCMF = clone(base_est)
                est.set_params(
                    structure_penalty=structure_penalty,
                    max_rank=max_rank,
                    factor_penalty=factor_penalty,
                )

                if est.init == "random":
                    est.init_kwargs["rng"] = default_rng(rng)

                est.fit(
                    X,
                    structure_weights=structure_weights,
                    factor_weights=factor_weights,
                    vs=vs,
                    ds=ds,
                    us=us,
                )

                if not est.converged_:
                    warn(
                        "Penalized estimation with parameters"
                        f" (structure_penalty={structure_penalty},"
                        f" max_rank={max_rank},"
                        f" factor_penalty={factor_penalty}):"
                        f" {est.__class__.__name__} did not converge"
                        f" after {est.n_iter_} iterations."
                    )

                # Save estimator for later
                dump(est, tmppath / f"{idx_params}_{idx_init}.pkl")

                return (
                    est.objective_value_,
                    est.elapsed_process_time_,
                    est.est_max_rank_,
                    # compute relative to supplied max_rank;
                    # est_max_rank_ could be less in case of
                    # factor pruning
                    (max_rank - est.est_max_rank_) * len(X)
                    + sum(
                        [sum(1 - p) for p in est.structure_pattern().values()]
                    ),
                    (
                        sum(
                            [
                                (max_rank - est.est_max_rank_)
                                * (p.shape[0] - 1)
                                + sum(1 - p)
                                for p in est.factor_pattern().values()
                            ]
                        )
                        if factor_penalty is not None
                        else 0
                    ),
                )

            if self.verbose:
                print(
                    f"Perform structure estimation ({n_reps * n_params} tasks)"
                )

            if self.init == "random":
                # We need to split the randomness for random initialization
                child_states = reshape(
                    rng.bit_generator._seed_seq.spawn(n_reps * n_params),
                    (n_params, n_reps),
                )
            else:
                # Dummy otherwise
                child_states = full((n_params, n_reps), None)

            out = Parallel(
                n_jobs=self.n_jobs, verbose=10 if self.verbose else 0
            )(
                delayed(_estimate_structure)(
                    idx_params,
                    idx_init,
                    structure_penalty,
                    max_rank,
                    factor_penalty,
                    vs,
                    ds,
                    us,
                    child_states[idx_params, idx_init],
                )
                for idx_params, (
                    structure_penalty,
                    max_rank,
                    factor_penalty,
                ) in enumerate(parameter_grid)
                for idx_init, (vs, ds, us) in inits()
            )

            (
                objective_values,
                elapsed_process_times,
                est_max_rank,
                structural_zeros,
                factor_zeros,
            ) = zip(*out)

            if self.verbose:
                print("Determine best runs")

            # Rely on the fact that joblib returns results in the same
            # order as the inputs
            objective_values = split(asarray(objective_values), n_params)
            best_runs = [int(argmin(vals)) for vals in objective_values]
            results["objective_value_penalized"] = [
                vals[idx] for idx, vals in zip(best_runs, objective_values)
            ]

            elapsed_process_times = split(
                asarray(elapsed_process_times), n_params
            )
            results["mean_elapsed_process_time_penalized"] = [
                mean(ts) for ts in elapsed_process_times
            ]
            results["std_elapsed_process_time_penalized"] = [
                std(ts) for ts in elapsed_process_times
            ]

            est_max_rank = split(asarray(est_max_rank), n_params)
            results["est_max_rank"] = [
                rks[idx] for idx, rks in zip(best_runs, est_max_rank)
            ]
            structural_zeros = split(asarray(structural_zeros), n_params)
            results["structural_zeros"] = [
                zs[idx] for idx, zs in zip(best_runs, structural_zeros)
            ]
            factor_zeros = split(asarray(factor_zeros), n_params)
            results["factor_zeros"] = [
                zs[idx] for idx, zs in zip(best_runs, factor_zeros)
            ]

            def _debiased_cv_score(
                est_in: SolrCMF,
                train_indices: dict[ViewDesc, NDArray[intp]],
                test_indices: dict[ViewDesc, NDArray[intp]],
            ):
                est: SolrCMF = clone(base_est)
                est.set_params(
                    init="custom",
                    init_kwargs={"reduce_max_rank": True},
                    factor_pruning=False,  # Set to False always
                )
                est.fit(
                    X,
                    indices=train_indices,
                    structure_pattern=est_in.structure_pattern(),
                    factor_pattern=est_in.factor_pattern(),
                    vs=est_in.vs_,
                    ds=est_in.ds_,
                    us=est_in.us_ if hasattr(est_in, "us_") else None,
                )

                if not est.converged_:
                    warn(
                        "Fixed structure estimation of"
                        f" {est.__class__.__name__} did not converge after"
                        f" {est.n_iter_} iterations."
                    )

                return (
                    score_fn(X, est.transform(X), indices=test_indices),
                    est.elapsed_process_time_,
                )

            # Reads fitted penalized estimators from cache and
            # extracts structure/factor patterns
            def solrcmf_estimators():
                for idx_params, idx_init in zip(range(n_params), best_runs):
                    yield load(tmppath / f"{idx_params}_{idx_init}.pkl")

            # We want exactly the same splits for all parameter combinations,
            # so we produce the splits once and then reuse them.
            cv_splits = list(cv.split(X))
            n_folds = cv.get_n_splits(X)

            if self.verbose:
                print(
                    "Perform debiased cross-validation"
                    f" ({n_params * n_folds} tasks)"
                )

            out = Parallel(
                n_jobs=self.n_jobs, verbose=10 if self.verbose else 0
            )(
                delayed(_debiased_cv_score)(
                    est,
                    train_indices,
                    test_indices,
                )
                for est in solrcmf_estimators()
                for train_indices, test_indices in cv_splits
            )
            (
                scores,
                elapsed_process_times,
            ) = zip(*out)

            for i in range(n_folds):
                results[f"{self.score}_fold{i}"] = [
                    scores[j * n_folds + i] for j in range(n_params)
                ]

            elapsed_process_times = split(
                asarray(elapsed_process_times), n_params
            )
            results["mean_elapsed_process_time_fixed"] = [
                mean(ts) for ts in elapsed_process_times
            ]
            results["std_elapsed_process_time_fixed"] = [
                std(ts) for ts in elapsed_process_times
            ]
        elif self.cv_strategy == "penalized_cv":

            def _penalized_cv_score(
                structure_penalty,
                max_rank,
                factor_penalty,
                vs,
                ds,
                us,
                train_indices,
                test_indices,
                rng,
            ):
                """Fit one penalized estimator on a training split and score it.

                A clone of ``base_est`` is configured with the given
                ``structure_penalty``, ``max_rank`` and ``factor_penalty``,
                fitted on ``X`` restricted to ``train_indices`` (starting
                from ``vs``/``ds``/``us``) and evaluated with ``score_fn`` on
                ``test_indices``.

                Parameters
                ----------
                structure_penalty, max_rank, factor_penalty
                    Hyper-parameters forwarded to ``est.set_params``.
                vs, ds, us
                    Initial values forwarded unchanged to ``est.fit``.
                train_indices, test_indices
                    Split description; the former is passed to ``est.fit``,
                    the latter to ``score_fn``.
                rng
                    Seed material passed to ``default_rng``; only used when
                    the estimator's ``init`` is ``"random"``.

                Returns
                -------
                tuple
                    ``(score, elapsed_process_time, est_max_rank,
                    structural_zeros, factor_zeros)``. ``factor_zeros`` is
                    ``0`` when ``factor_penalty`` is ``None``.
                """
                est: SolrCMF = clone(base_est)
                est.set_params(
                    structure_penalty=structure_penalty,
                    max_rank=max_rank,
                    factor_penalty=factor_penalty,
                )

                # Each task gets its own generator so that parallel runs do
                # not share random state.
                # NOTE(review): assumes `est.init_kwargs` is a dict here (it
                # may be None by the estimator's defaults) -- confirm that
                # `base_est` is prepared accordingly before this point.
                if est.init == "random":
                    est.init_kwargs["rng"] = default_rng(rng)

                est.fit(
                    X,
                    indices=train_indices,
                    structure_weights=structure_weights,
                    factor_weights=factor_weights,
                    vs=vs,
                    ds=ds,
                    us=us,
                )

                # Non-convergence is reported but does not abort the task;
                # the score of the non-converged fit is still returned.
                if not est.converged_:
                    warn(
                        "Penalized estimation with parameters"
                        f" (structure_penalty={structure_penalty},"
                        f" max_rank={max_rank},"
                        f" factor_penalty={factor_penalty}):"
                        f" {est.__class__.__name__} did not converge"
                        f" after {est.n_iter_} iterations."
                    )

                return (
                    score_fn(X, est.transform(X), indices=test_indices),
                    est.elapsed_process_time_,
                    est.est_max_rank_,
                    # compute relative to supplied max_rank;
                    # est_max_rank_ could be less in case of
                    # factor pruning
                    # Every rank dropped relative to `max_rank` contributes
                    # `len(X)` zeros; the remaining zeros are counted from
                    # the structure pattern (presumably 0/1 indicators).
                    (max_rank - est.est_max_rank_) * len(X)
                    + sum(
                        [sum(1 - p) for p in est.structure_pattern().values()]
                    ),
                    (
                        # Same idea per factor: each dropped rank counts as
                        # `p.shape[0] - 1` zeros, plus the zeros recorded in
                        # the factor pattern itself.
                        sum(
                            [
                                (max_rank - est.est_max_rank_)
                                * (p.shape[0] - 1)
                                + sum(1 - p)
                                for p in est.factor_pattern().values()
                            ]
                        )
                        if factor_penalty is not None
                        else 0
                    ),
                )

            # We want exactly the same splits for all parameter combinations,
            # so we produce the splits once and then reuse them.
            cv_splits = list(cv.split(X))
            n_folds = cv.get_n_splits(X)

            if self.verbose:
                print(
                    "Perform penalized cross-validation"
                    f" ({n_params * n_reps * n_folds} tasks)"
                )

            if self.init == "random":
                # We need to split the randomness for random initialization
                child_states = reshape(
                    rng.bit_generator._seed_seq.spawn(
                        n_reps * n_params * n_folds
                    ),
                    (n_params, n_reps, n_folds),
                )
            else:
                # Dummy otherwise
                child_states = full((n_params, n_reps, n_folds), None)

            out = Parallel(
                n_jobs=self.n_jobs, verbose=10 if self.verbose else 0
            )(
                delayed(_penalized_cv_score)(
                    structure_penalty,
                    max_rank,
                    factor_penalty,
                    vs,
                    ds,
                    us,
                    train_indices,
                    test_indices,
                    child_states[idx_param, idx_init, idx_fold],
                )
                for idx_param, (
                    structure_penalty,
                    max_rank,
                    factor_penalty,
                ) in enumerate(parameter_grid)
                for idx_init, (vs, ds, us) in inits()
                for idx_fold, (train_indices, test_indices) in enumerate(
                    cv_splits
                )
            )

            (
                scores,
                elapsed_process_times,
                est_max_rank,
                structural_zeros,
                factor_zeros,
            ) = zip(*out)

            for i in range(n_folds):
                results[f"{self.score}_fold{i}"] = [nan] * n_params

            best_runs = [-1] * n_params
            best_score = [inf] * n_params
            for idx_params, scores_params in enumerate(
                split(asarray(scores), n_params)
            ):
                for idx_init, scores_inits in enumerate(
                    split(scores_params, n_reps)
                ):
                    if mean(scores_inits) < best_score[idx_params]:
                        best_score[idx_params] = mean(scores_inits)
                        best_runs[idx_params] = idx_init
                        for i in range(n_folds):
                            results[f"{self.score}_fold{i}"][
                                idx_params
                            ] = scores_inits[i]

            elapsed_process_times = split(
                asarray(elapsed_process_times), n_params
            )
            results["mean_elapsed_process_time"] = [
                mean(ts) for ts in elapsed_process_times
            ]
            results["std_elapsed_process_time"] = [
                std(ts) for ts in elapsed_process_times
            ]

            results["est_max_rank"] = [
                mean(
                    est_max_rank[
                        (
                            idx_params * n_reps * n_folds
                            + best_runs[idx_params] * n_folds
                        ) : (
                            idx_params * n_reps * n_folds
                            + (best_runs[idx_params] + 1) * n_folds
                        )
                    ]
                )
                for idx_params in range(n_params)
            ]
            results["structural_zeros"] = [
                mean(
                    structural_zeros[
                        (
                            idx_params * n_reps * n_folds
                            + best_runs[idx_params] * n_folds
                        ) : (
                            idx_params * n_reps * n_folds
                            + (best_runs[idx_params] + 1) * n_folds
                        )
                    ]
                )
                for idx_params in range(n_params)
            ]

            results["factor_zeros"] = [
                mean(
                    factor_zeros[
                        (
                            idx_params * n_reps * n_folds
                            + best_runs[idx_params] * n_folds
                        ) : (
                            idx_params * n_reps * n_folds
                            + (best_runs[idx_params] + 1) * n_folds
                        )
                    ]
                )
                for idx_params in range(n_params)
            ]

        # Post-processing on the full dictionary. Same for both cases
        scores = vstack(
            [results[f"{self.score}_fold{i}"] for i in range(n_folds)]
        )
        results.update(
            {
                f"mean_{self.score}": scores.mean(0),
                f"std_{self.score}": scores.std(0),
            }
        )

        self.cv_results_ = results

        if self.verbose:
            print("Re-fit final estimator")

        if self.refit.startswith("mean"):
            self.best_index_ = argmax(results[f"mean_{self.score}"])
        elif self.refit.startswith("1se"):
            # Choose the solution with maximal structure sparsity within
            # 1 standard error of the best solution
            max_index = argmax(results[f"mean_{self.score}"])

            candidates = flatnonzero(
                results[f"mean_{self.score}"]
                >= (
                    results[f"mean_{self.score}"][max_index]
                    - results[f"std_{self.score}"][max_index]
                )
            )

            # Primarily choose the solution with the most
            # structural zeros and then select the solution with the
            # most factor zeros if factor sparsity was requested
            structural_zeros = asarray(
                [results["structural_zeros"][i] for i in candidates]
            )
            most_sz_candidates = candidates[
                flatnonzero(structural_zeros == max(structural_zeros))
            ]

            factor_zeros = [
                results["factor_zeros"][i] for i in most_sz_candidates
            ]

            self.best_index_ = most_sz_candidates[argmax(factor_zeros)]

        structure_penalty, max_rank, factor_penalty = parameter_grid[
            self.best_index_
        ]

        if self.verbose:
            print(
                "Best fit with\n"
                f"  structure_penalty = {structure_penalty}\n"
                f"  max_rank = {max_rank}\n"
                f"  factor_penalty = {factor_penalty}\n\n"
                "  estimated max_rank = "
                f"{results['est_max_rank'][self.best_index_]}"
            )

        # Re-fit best run on all data
        if self.cv_strategy == "structure_first_debiased_cv":
            # Load respective penalized estimator from cache
            est = load(
                tmppath
                / f"{self.best_index_}_{best_runs[self.best_index_]}.pkl"
            )

            self.best_estimator_ = clone(base_est)

            if self.refit.endswith("debiased"):
                self.best_estimator_.set_params(
                    init="custom",
                    init_kwargs={"reduce_max_rank": True},
                    factor_pruning=False,  # Set to False always
                )
                self.best_estimator_.fit(
                    X,
                    structure_pattern=est.structure_pattern(),
                    factor_pattern=est.factor_pattern(),
                    vs=est.vs_,
                    ds=est.ds_,
                    us=est.us_ if hasattr(est, "us_") else None,
                )
            elif self.refit.endswith("penalized"):
                self.best_estimator_.set_params(
                    structure_penalty=structure_penalty,
                    max_rank=max_rank,
                    factor_penalty=factor_penalty,
                    init="custom",
                )
                self.best_estimator_.fit(
                    X,
                    vs=est.vs_,
                    ds=est.ds_,
                    us=est.us_ if hasattr(est, "us_") else None,
                )

            tmpdir.cleanup()
        elif self.cv_strategy == "penalized_cv":
            # A penalized fit needs to be performed irrespectively.
            # Either because it is the final fit or because we need the
            # structure/factor pattern.
            if self.init == "custom":
                vs_init = vs[best_runs[self.best_index_]]
                ds_init = ds[best_runs[self.best_index_]]
                if us is not None:
                    us_init = us[best_runs[self.best_index_]]
                else:
                    us_init = None
            else:
                vs_init = None
                ds_init = None
                us_init = None

            final_est = clone(base_est)
            final_est.set_params(
                structure_penalty=structure_penalty,
                max_rank=max_rank,
                factor_penalty=factor_penalty,
            )
            final_est.fit(
                X,
                vs=vs_init,
                ds=ds_init,
                us=us_init,
            )

            if self.refit.endswith("debiased"):
                final_est_debiased = clone(base_est)
                final_est_debiased.set_params(
                    init="custom",
                    init_kwargs={"reduce_max_rank": True},
                    factor_pruning=False,  # Set to False always
                )
                final_est_debiased.fit(
                    X,
                    structure_pattern=final_est.structure_pattern(),
                    factor_pattern=final_est.factor_pattern(),
                    vs=final_est.vs_,
                    ds=final_est.ds_,
                    us=final_est.us_ if hasattr(final_est, "us_") else None,
                )

                self.best_estimator_ = final_est_debiased
            elif self.refit.endswith("penalized"):
                self.best_estimator_ = final_est

        self.best_max_rank_ = self.best_estimator_.est_max_rank_

        return self

Initialization

multiview_init

Source code in solrcmf/initstrategies.py

def multiview_init(
    xs: dict[ViewDesc, NDArray[float64]],
    max_rank: int,
) -> tuple[dict[Hashable, NDArray[float64]], dict[ViewDesc, NDArray[float64]]]:
    """Initialize factors and singular values from a joint SVD.

    The views in ``xs`` must form a multiview layout: either all keys share
    their first element (all matrices have the same row entity) or all keys
    share their second element (same column entity). The matrices are
    concatenated along the non-shared dimension, the joint matrix is
    decomposed with a singular value decomposition and the leading
    ``max_rank`` singular vectors are split into one factor per key element.

    Parameters
    ----------
    xs : dict
        Data matrices keyed by a pair ``(row_key, col_key)``.
    max_rank : int
        Number of leading singular vectors to keep.

    Returns
    -------
    vs : dict
        Maps every key element to its factor matrix. Each factor has
        ``min(max_rank, min(x_joint.shape))`` columns.
    ds : dict
        Maps every view key ``k`` to ``diag(vs[k[0]].T @ xs[k] @ vs[k[1]])``.

    Raises
    ------
    ValueError
        If neither the first nor the second key element is shared by all
        views.
    """
    layout = list(xs.keys())
    if len({k[0] for k in layout}) == 1:
        # Shared rows: stack the views side by side and transpose so that
        # the non-shared dimension runs along the rows of the joint matrix.
        x_joint = hstack(list(xs.values())).T
        jx, ix = 0, 1
    elif len({k[1] for k in layout}) == 1:
        # Shared columns: stack the views on top of each other.
        x_joint = vstack(list(xs.values()))
        jx, ix = 1, 0
    else:
        raise ValueError("'xs' does not follow a multiview layout")

    # Only the leading `max_rank` singular vectors are used below, so the
    # thin SVD is sufficient. The full SVD would allocate a square `u` whose
    # size is the sum of all non-shared dimensions.
    u, _, vt = svd(x_joint, full_matrices=False)

    # Factor of the shared entity.
    vs = {layout[0][jx]: vt.T[:, :max_rank]}

    # Rows of `u` are ordered like the concatenated views; cut out the
    # consecutive row block belonging to each view.
    current = 0
    for k, x in xs.items():
        stop = current + x.shape[ix]
        vs[k[ix]] = u[current:stop, :max_rank]
        current = stop

    ds = {k: diag(vs[k[0]].T @ x @ vs[k[1]]) for k, x in xs.items()}

    return vs, ds

best_random_init

Source code in solrcmf/initstrategies.py

def best_random_init(
    xs: dict[ViewDesc, NDArray[float64]],
    max_rank: int,
    *,
    n_inits: int = 1,
    n_jobs: int = -1,
    rng: Generator | int | None = None,
    **kwargs,
) -> SolrCMF:
    """Run several randomly initialized fits and return the best one.

    ``n_inits`` estimators are fitted in parallel, each with
    ``structure_penalty=0.0``, ``factor_pruning=False`` and random
    initialization driven by its own child generator spawned from ``rng``.
    The fit with the smallest ``objective_value_`` is returned.

    Parameters
    ----------
    xs : dict
        Data matrices passed to ``SolrCMF.fit``.
    max_rank : int
        Maximum rank passed to the estimator.
    n_inits : int
        Number of random initializations. Must be positive.
    n_jobs : int
        Number of parallel jobs handed to ``Parallel``.
    rng : Generator, int or None
        Seed material for ``default_rng``.
    **kwargs
        Additional keyword arguments forwarded to the ``SolrCMF``
        constructor.

    Returns
    -------
    SolrCMF
        The fitted estimator with the lowest objective value. If no run has
        an objective value below ``inf`` (e.g. all are NaN), the first run
        is returned.
    """
    assert n_inits > 0, "`n_inits` needs to be a positive integer"

    rng = default_rng(rng)

    def init_run(
        xs: dict[ViewDesc, NDArray[float64]], rng: Generator
    ) -> SolrCMF:
        return SolrCMF(
            structure_penalty=0.0,
            max_rank=max_rank,
            factor_pruning=False,
            init="random",
            init_kwargs={"rng": rng},
            **kwargs,
        ).fit(xs)

    # Independent child generators so that the parallel runs do not share
    # random state.
    rng_inits = rng.spawn(n_inits)

    ests_init: list[SolrCMF] = Parallel(n_jobs=n_jobs)(
        delayed(init_run)(xs, ri) for ri in rng_inits
    )

    # Start from the first run so that a result is always bound, even when
    # no objective value compares below `inf` (NaN or infinite objectives).
    best_est_init = ests_init[0]
    best_obj = inf

    for est in ests_init:
        if est.objective_value_ < best_obj:
            best_obj = est.objective_value_
            best_est_init = est

    return best_est_init

Synthetic data generation

simulate

Auxiliary methods

LowRankImputation

Bases: BaseEstimator

Source code in solrcmf/lrimpute.py

class LowRankImputation(BaseEstimator):
    """Low-rank matrix factorization for matrices with missing entries.

    Fits ``U`` and ``V`` by alternating ridge regressions on the observed
    (non-NaN) entries of ``X``, minimizing

        0.5 * sum_{i, j obs.} (x^(i, j) - u^(i, :) v^(j, :))^2
        + penalty / 2 * ||u||_F^2 + penalty / 2 * ||v||_F^2

    Parameters
    ----------
    penalty : float
        Non-negative ridge penalty added to the diagonal of each normal
        equation.
    max_rank : int
        Rank passed to the initialization helper.
    init : {"random", "custom"}
        Initialization mode passed to the initialization helper.
    warm_start : bool
        Stored as an estimator parameter; not read by ``fit`` as shown here.
    max_iter : int
        Maximum number of alternating sweeps.
    tol : float
        Relative tolerance on the decrease of the loss.
    random_state : int, RandomState or None
        Random state passed to the initialization helper.

    Attributes
    ----------
    converged_ : bool
        Whether the stopping criterion was met within ``max_iter`` sweeps.
    U_, V_ : ndarray
        Fitted factors with one row per row/column of ``X`` respectively.
    n_iter_ : int
        Number of sweeps performed.
    n_features_in_ : int
        Number of columns of ``X``.
    loss_ : float
        Loss after the last sweep.
    """

    _parameter_constraints = {
        "penalty": [Interval(Real, 0, None, closed="left")],
        "max_rank": [Interval(Integral, 1, None, closed="left")],
        "init": [StrOptions({"random", "custom"})],
        "warm_start": ["boolean"],
        "max_iter": [Interval(Integral, 1, None, closed="left")],
        "tol": [Interval(Real, 0, None, closed="left")],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        *,
        penalty: float = 1.0,
        max_rank: int = 10,
        init: str = "random",
        warm_start: bool = False,
        max_iter: int = 1000,
        tol: float = 1e-6,
        random_state: int | RandomState | None = None,
    ):
        self.penalty = penalty
        self.max_rank = max_rank
        self.init = init
        # Every constructor argument has to be stored under its own name;
        # otherwise `get_params`, `clone` and parameter validation fail with
        # an AttributeError.
        self.warm_start = warm_start
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state

    def _more_tags(self):
        return {"allow_nan": True}

    def fit(self, X, y=None, *, U=None, V=None):
        """Fit the factorization to the observed entries of ``X``.

        Parameters
        ----------
        X : array-like
            Two-dimensional input; NaN marks a missing entry.
        y : ignored
            Not used.
        U, V : ndarray or None
            Passed to the initialization helper together with ``init``.
            NOTE(review): presumably the starting values used when
            ``init="custom"`` -- confirm against ``_initialize``.

        Returns
        -------
        self
        """
        self._validate_params()

        X = check_array(
            X, dtype=[float64, float32], force_all_finite="allow-nan"
        )

        U, V = _initialize(
            self.max_rank,
            self.init,
            self.random_state,
            X,
            U,
            V,
        )

        penalty = self.penalty
        max_iter = self.max_iter
        tol = self.tol

        loss_old = _compute_loss(X, U, V, penalty)

        converged = False
        for i in range(max_iter):
            # We will solve
            # min_{u, v} 0.5 sum_{i, j obs.} (x^(i, j) - u^(i, :) v^(j, :))^2
            #            + lambda / 2 * ||u||_F^2
            #            + lambda / 2 * ||v||_F^2

            # Given fixed v this is a ridge regression problem for each
            # u^(i, :) for a subset of the rows of v
            for r in range(X.shape[0]):
                indices = flatnonzero(~isnan(X[r, :]))
                A = V[indices, :].T @ V[indices, :]
                fill_diagonal(A, diagonal(A) + penalty)
                b = V[indices, :].T @ X[r, :][indices]
                U[r, :] = solve(A, b)

            # Given fixed u this is a ridge regression problem for each
            # v^(j, :) for a subset of the rows of u
            for c in range(X.shape[1]):
                indices = flatnonzero(~isnan(X[:, c]))
                A = U[indices, :].T @ U[indices, :]
                fill_diagonal(A, diagonal(A) + penalty)
                b = U[indices, :].T @ X[:, c][indices]
                V[c, :] = solve(A, b)

            loss = _compute_loss(X, U, V, penalty)

            # Stop once the relative decrease of the loss falls below `tol`.
            if (loss_old - loss) < tol * loss_old:
                converged = True
                break

            loss_old = loss

        self.converged_ = converged
        self.U_ = U
        self.V_ = V
        self.n_iter_ = i + 1
        self.n_features_in_ = X.shape[1]
        self.loss_ = loss

        return self

bicenter

Bicenter the input matrix allowing for missing values.

Computes a total mean as well as row and column means.

Implements the centering algorithm described in

Hastie et al. (2015) Matrix completion and low-rank SVD via fast alternating least squares. Journal of Machine Learning Research, 16(104):3367--3402, 2015.

Parameters

X : ndarray — The input matrix. tol : float — Convergence tolerance. max_iter : int — Maximum number of iterations to perform.

Returns

(Y, m, rm, cm) : (ndarray, float, ndarray, ndarray) — The bi-centered matrix, the overall mean, the row means and the column means.

Source code in solrcmf/preprocess.py

def bicenter(X: NDArray[float64], tol: float = 1e-16, max_iter: int = 10):
    """Bicenter the input matrix allowing for missing values.

    Computes a total mean as well as row and column means.

    Implements the centering algorithm described in

    Hastie et al. (2015) Matrix completion and low-rank SVD via fast
    alternating least squares. Journal of Machine Learning Research,
    16(104):3367--3402, 2015.

    A warning is issued if the residual criterion has not dropped to
    ``tol`` after ``max_iter`` iterations.

    Parameters
    ----------
    X : ndarray
        The input matrix
    tol : float
        Convergence tolerance
    max_iter : int
        Maximum number of iterations to perform.

    Returns
    -------
    (Y, m, rm, cm) : (ndarray, float, ndarray, ndarray)
        Returns the bi-centered matrix, the overall mean, as well as
        row-means and column-means. Missing entries of the input stay
        missing in the bi-centered matrix.
    """
    X = check_array(X, force_all_finite="allow-nan")
    assert tol > 0, "'tol' needs to be positive"
    assert (
        isinstance(max_iter, Integral) and max_iter > 0
    ), "'max_iter' needs to be a positive integer"

    n, p = X.shape

    # Positions of the observed (non-NaN) entries: flat over the whole
    # matrix and separately per row and per column.
    mask = logical_not(isnan(X))
    indices = flatnonzero(mask)
    row_indices = [flatnonzero(mask[i, :]) for i in range(n)]
    col_indices = [flatnonzero(mask[:, i]) for i in range(p)]

    # Initialization
    total_mean = mean(X.flat[indices])
    row_means = array(
        [
            mean(X[i, :].flat[idx]) if len(idx) > 0 else 0.0
            for i, idx in enumerate(row_indices)
        ]
    )[:, None]
    col_means = array(
        [
            mean(X[:, i].flat[idx]) if len(idx) > 0 else 0.0
            for i, idx in enumerate(col_indices)
        ]
    )[None, :]

    # Iterate
    for _ in range(max_iter):
        total_mean = mean(
            X.flat[indices] - (row_means + col_means).flat[indices]
        )
        # Rows/columns without any observed entry get a mean of zero.
        row_means = array(
            [
                (
                    mean(
                        X[i, :].flat[idx] - (total_mean + col_means).flat[idx]
                    )
                    if len(idx) > 0
                    else 0.0
                )
                for i, idx in enumerate(row_indices)
            ]
        )[:, None]
        col_means = array(
            [
                (
                    mean(
                        X[:, i].flat[idx] - (total_mean + row_means).flat[idx]
                    )
                    if len(idx) > 0
                    else 0.0
                )
                for i, idx in enumerate(col_indices)
            ]
        )[None, :]
        r_crit = _residual(
            X,
            indices,
            row_indices,
            col_indices,
            total_mean,
            row_means,
            col_means,
        )

        if r_crit <= tol:
            break
    else:
        # Reached only when the loop ran out of iterations without meeting
        # the tolerance; converging on the very last iteration must not
        # trigger the warning.
        warn(f"Bi-centering did not converge in {max_iter} iterations")

    Y = X.copy()
    Y[mask] -= (total_mean + row_means + col_means)[mask]

    return Y, total_mean, row_means, col_means

nanscale

Source code in solrcmf/preprocess.py

def nanscale(X: NDArray[floating[Any]], scale: float):
    """Divide the observed entries of ``X`` by ``scale``.

    Entries that are NaN in ``X`` are NaN in the result. The input is left
    untouched; a new array of the same shape and dtype is returned.
    """
    observed = logical_not(isnan(X))
    # Start from an all-NaN array and only write the quotient where the
    # input is observed.
    scaled = full(X.shape, nan, dtype=X.dtype)
    return divide(X, scale, out=scaled, where=observed)