# Source code for simplestatistics.statistics.bayesian_classifier

"""
Implements bayesian_classifier().
"""

# I need sane division that returns a float not int
from __future__ import division

class bayesian_classifier():
    """
    A `naive Bayesian classifier`_.

    The implementation of this classifier is very closely modeled after the
    implementation in `simple-statistics
    <https://github.com/simple-statistics/simple-statistics>`_, the javascript
    analogue of simplestatistics. You can find the javascript implementation
    of the Bayesian classifier `here
    <https://github.com/simple-statistics/simple-statistics/blob/master/src/bayesian_classifier.js>`_.

    .. _naive Bayesian classifier: https://en.wikipedia.org/wiki/Naive_Bayes_classifier

    The score of a category for an item is the sum, over every
    property/value pair of the item, of the number of times that pair was
    seen in training for that category divided by the total number of
    trained items.

    Attributes are plain and public:

    * ``count`` -- number of items the model has been trained on.
    * ``store`` -- nested dict ``{category: {property: {value: times_seen}}}``.

    Examples:
        Making a seventy-five/twenty-five classification.

        >>> model1 = bayesian_classifier()
        >>> model1.train({'species': 'cat'}, 'animal')
        >>> model1.train({'species': 'cat'}, 'animal')
        >>> model1.train({'species': 'cat'}, 'animal')
        >>> model1.train({'species': 'cat'}, 'feline')
        >>> model1.count
        4
        >>> model1.score({'species': 'cat'})
        [('animal', 0.75), ('feline', 0.25)]

        Classifying multiple things. Categories with equal scores keep the
        order in which they were first trained (dicts preserve insertion
        order on Python 3.7+ and the sort is stable).

        >>> model2 = bayesian_classifier()
        >>> model2.train({'species': 'cat'}, 'animal')
        >>> model2.train({'species': 'dog'}, 'animal')
        >>> model2.train({'species': 'dog'}, 'animal')
        >>> model2.train({'species': 'cat'}, 'chair')
        >>> model2.score({'species': 'cat'})
        [('animal', 0.25), ('chair', 0.25)]
        >>> model2.score({'species': 'dog'})
        [('animal', 0.5), ('chair', 0)]

        Testing multiple properties

        >>> model3 = bayesian_classifier()
        >>> model3.train({'species': 'cat'}, 'animal')
        >>> model3.train({'species': 'cat'}, 'animal')
        >>> model3.train({'species': 'cat'}, 'animal')
        >>> model3.train({'species': 'cat'}, 'chair')
        >>> model3.train({'species': 'cat', 'color': 'white'}, 'chair')
        >>> model3.score({'color': 'white'})
        [('chair', 0.2), ('animal', 0)]

        Scoring an item with several properties adds up the contribution
        of each property.

        >>> model4 = bayesian_classifier()
        >>> model4.train({'species': 'cat', 'color': 'white'}, 'animal')
        >>> model4.train({'species': 'cat', 'color': 'black'}, 'animal')
        >>> model4.train({'species': 'cat'}, 'animal')
        >>> model4.train({'color': 'white'}, 'chair')
        >>> model4.score({'species': 'cat', 'color': 'white'})
        [('animal', 1.0), ('chair', 0.25)]

        Scoring before any training raises an error.

        >>> mod = bayesian_classifier()
        >>> mod.score({'color': 'purple'})  # doctest: +ELLIPSIS
        Traceback (most recent call last):
            ...
        RuntimeError: The model has not been trained yet. Train the model ... item.
    """

    def __init__(self):
        # number of items the model trained on
        self.count = 0
        # storage for all property and category keys and values, shaped as
        # {category: {property: {value: times_seen}}}
        self.store = {}

    def train(self, item, category):
        """
        The method to train the instance of the Bayesian classifier
        on an item.

        Args:
            item: A dict of property-value pairs in the form
                {property_1: value1, property2: value2, . . .} for the item.
                Property values must be hashable because they are used as
                dictionary keys.
            category: The category of the item (typically a string; it must
                be hashable).

        Returns:
            None
        """

        # fetch the per-category storage, creating it the first time the
        # model sees this category (even if the item has no properties)
        category_store = self.store.setdefault(category, {})

        for item_key, item_value in item.items():
            # per-property tally of values seen for this category
            value_counts = category_store.setdefault(item_key, {})
            value_counts[item_value] = value_counts.get(item_value, 0) + 1

        # increment count of trained items
        self.count += 1

    def score(self, item):
        """
        Scores a certain item based on the learning the model has done
        on previous items.

        Args:
            item: A dict in the form {property: value} of the item you want
                to score.

        Returns:
            A list of (category, score) tuples, ordered in descending order
            of scores. A category's score is the sum, over every
            property/value pair in item, of the number of times the pair was
            seen for that category divided by the total number of trained
            items. A category in which none of the pairs were seen scores
            the integer 0. An empty item yields an empty list.

        Raises:
            RuntimeError: If the model has not been trained on any item yet.
        """

        if not self.store:
            raise RuntimeError('The model has not been trained yet. Train the '
                               'model before trying to score an item.')

        # with no properties there is nothing to score against any category
        if not item:
            return []

        odds_sums = {}

        # iterate over all categories that the model has trained on in the past
        for category, properties in self.store.items():
            # start from the integer 0 so categories without any match keep
            # reporting 0 rather than 0.0
            total = 0

            for item_key, item_value in item.items():
                seen = properties.get(item_key, {}).get(item_value)

                # unseen property or value for this category: contributes zero
                if seen is None:
                    continue

                # times seen divided by total training trials
                total += seen / self.count

            odds_sums[category] = total

        # return a list of categories and scores in descending order of scores
        return sorted(odds_sums.items(), key=lambda pair: pair[1], reverse=True)