Source code for chemicalx.data.labeledtriples

"""A module for the labeled triples class."""

from typing import ClassVar, Iterable, Mapping, Optional, Sequence, Tuple, Union

import pandas as pd
from sklearn.model_selection import train_test_split

__all__ = ["LabeledTriples"]


[docs]class LabeledTriples: """Labeled triples for drug pair scoring.""" columns: ClassVar[Sequence[str]] = ("drug_1", "drug_2", "context", "label") dtype: ClassVar[Mapping[str, type]] = {"drug_1": str, "drug_2": str, "context": str, "label": float} def __init__(self, data: Union[pd.DataFrame, Iterable[Sequence]]): """Initialize the labeled triples object.""" if not isinstance(data, pd.DataFrame): data = pd.DataFrame(data, columns=self.columns).astype(self.dtype) self.data = data def __len__(self) -> int: """Get the number of triples.""" return len(self.data.index)
[docs] def drop_duplicates(self): """Drop the duplicated entries.""" self.data = self.data.drop_duplicates()
def __add__(self, value: "LabeledTriples") -> "LabeledTriples": """ Add the triples in two LabeledTriples objects together - syntactic sugar for '+'. :param value: Another LabeledTriples object for the addition. :returns: A LabeledTriples object after the addition. """ return LabeledTriples(pd.concat([self.data, value.data]))
[docs] def get_drug_count(self) -> int: """Get the number of drugs in the labeled triples dataset.""" return pd.unique(self.data[["drug_1", "drug_2"]].values.ravel("K")).shape[0]
[docs] def get_context_count(self) -> int: """Get the number of unique contexts in the labeled triples dataset.""" return self.data["context"].nunique()
[docs] def get_combination_count(self) -> int: """Get the number of unique drug pairs in the labeled triples dataset.""" combination_count = self.data[["drug_1", "drug_2"]].drop_duplicates().shape[0] return combination_count
[docs] def get_labeled_triple_count(self) -> int: """Get the number of triples in the labeled triples dataset.""" triple_count = self.data.shape[0] return triple_count
[docs] def get_positive_count(self) -> int: """Get the number of positive triples in the dataset.""" return int(self.data["label"].sum())
[docs] def get_negative_count(self) -> int: """Get the number of negative triples in the dataset.""" return self.get_labeled_triple_count() - self.get_positive_count()
[docs] def get_positive_rate(self) -> float: """Get the ratio of positive triples in the dataset.""" return self.data["label"].mean()
[docs] def get_negative_rate(self) -> float: """Get the ratio of positive triples in the dataset.""" return 1.0 - self.data["label"].mean()
[docs] def train_test_split( self, train_size: Optional[float] = None, random_state: Optional[int] = 42 ) -> Tuple["LabeledTriples", "LabeledTriples"]: """ Split the LabeledTriples object for training and testing. :param train_size: The ratio of training triples. Default is 0.8 if None is passed. :param random_state: The random seed. Default is 42. Set to none for no fixed seed. :returns: A pair of training triples and testing triples """ train_data, test_data = train_test_split(self.data, train_size=train_size or 0.8, random_state=random_state) return LabeledTriples(train_data), LabeledTriples(test_data)