Source code for simplestatistics.statistics.bayesian_classifier

"""
Implements bayesian_classifier().
"""

# Use true division so that "/" returns a float instead of truncating to an int on Python 2.
from __future__ import division

class bayesian_classifier():
    """
    A `naive Bayesian classifier`_.

    The implementation of this classifier is very closely modeled after the implementation
    in `simple-statistics <https://github.com/simple-statistics/simple-statistics>`_,
    the javascript analogue of ``simplestatistics``. You can find the javascript implementation
    of the Bayesian classifier
    `here <https://github.com/simple-statistics/simple-statistics/blob/master/src/bayesian_classifier.js>`_.

    .. _`naive Bayesian classifier`: https://en.wikipedia.org/wiki/Naive_Bayes_classifier

    The score of a category for an item is the sum, over every property of the
    item, of the number of times that property/value pair was seen for the
    category during training divided by the total number of trained items.

    Attributes:
        count: Number of items the model has been trained on.
        store: Nested dict of training counts in the form
            ``{category: {property: {value: times_seen}}}``.

    Examples:
        Making a seventy-five/twenty-five classification.

        >>> model1 = bayesian_classifier()
        >>> model1.train({'species': 'cat'}, 'animal')
        >>> model1.train({'species': 'cat'}, 'animal')
        >>> model1.train({'species': 'cat'}, 'animal')
        >>> model1.train({'species': 'cat'}, 'feline')
        >>> model1.count
        4
        >>> model1.score({'species': 'cat'})
        [('animal', 0.75), ('feline', 0.25)]

        Classifying multiple things. The first result is a tie, and the relative
        order of tied categories depends on dict ordering, so it is sorted here
        to keep the example deterministic.

        >>> model2 = bayesian_classifier()
        >>> model2.train({'species': 'cat'}, 'animal')
        >>> model2.train({'species': 'dog'}, 'animal')
        >>> model2.train({'species': 'dog'}, 'animal')
        >>> model2.train({'species': 'cat'}, 'chair')
        >>> sorted(model2.score({'species': 'cat'}))
        [('animal', 0.25), ('chair', 0.25)]
        >>> model2.score({'species': 'dog'})
        [('animal', 0.5), ('chair', 0)]

        Testing multiple properties

        >>> model3 = bayesian_classifier()
        >>> model3.train({'species': 'cat'}, 'animal')
        >>> model3.train({'species': 'cat'}, 'animal')
        >>> model3.train({'species': 'cat'}, 'animal')
        >>> model3.train({'species': 'cat'}, 'chair')
        >>> model3.train({'species': 'cat', 'color': 'white'}, 'chair')
        >>> model3.score({'color': 'white'})
        [('chair', 0.2), ('animal', 0)]

        Scores are summed over all properties of the scored item

        >>> model4 = bayesian_classifier()
        >>> model4.train({'species': 'cat', 'color': 'white'}, 'animal')
        >>> model4.train({'species': 'cat', 'color': 'black'}, 'animal')
        >>> model4.train({'species': 'dog', 'color': 'white'}, 'animal')
        >>> model4.train({'species': 'cat', 'color': 'white'}, 'chair')
        >>> model4.score({'species': 'cat', 'color': 'white'})
        [('animal', 1.0), ('chair', 0.5)]

        >>> mod = bayesian_classifier()
        >>> mod.score({'color': 'purple'}) # doctest: +ELLIPSIS
        Traceback (most recent call last):
            ...
        RuntimeError: The model has not been trained yet. Train the model ... item.
    """

    def __init__(self):
        # number of items the model trained on
        self.count = 0
        # training counts, nested as {category: {property: {value: times seen}}}
        self.store = {}

    def train(self, item, category):
        """
        Train this instance of the Bayesian classifier on a single item.

        Args:
            item: A dict of property-value pairs in the form
                {property_1: value_1, property_2: value_2, . . .} for the item.
                Values are used as dict keys, so they must be hashable.
            category: The category of the item (typically a string).

        Returns:
            None
        """

        # create the per-category dict the first time this category is seen
        properties = self.store.setdefault(category, {})

        for item_key, item_value in item.items():
            # create the per-property dict the first time this property is
            # seen for the category, then bump the counter for the value
            value_counts = properties.setdefault(item_key, {})
            value_counts[item_value] = value_counts.get(item_value, 0) + 1

        # every call counts as one trained item, even if item has no properties
        self.count += 1

    def score(self, item):
        """
        Score an item against every category the model has been trained on.

        Args:
            item: A dict in the form {property: value} of the item you want to score.

        Returns:
            A list of (category, score) tuples ordered by descending score.
            A category that never matched any property of the item has a
            score of 0. An empty item yields an empty list.

        Raises:
            RuntimeError: If the model has not been trained yet.
        """

        if not self.store:
            raise RuntimeError('The model has not been trained yet. Train the '
                               'model before trying to score an item.')

        # running total per category, accumulated across all item properties
        odds_sums = {}

        for item_key, item_value in item.items():
            for category in self.store:
                # number of times this property/value pair was seen for the
                # category; 0 when either the property or the value is unknown
                seen = self.store[category].get(item_key, {}).get(item_value, 0)

                # add an int 0 for misses so never-matched categories keep
                # reporting a plain 0 rather than 0.0
                contribution = seen / self.count if seen else 0
                odds_sums[category] = odds_sums.get(category, 0) + contribution

        # categories and scores in descending order of scores
        return sorted(odds_sums.items(), key=lambda pair: pair[1], reverse=True)