Source code for simplestatistics.statistics.bayesian_classifier
"""
Implements bayesian_classifier().
"""
# I need sane division that returns a float not int
from __future__ import division
[docs]class bayesian_classifier():
"""
A `naive Bayesian classifier`_.
The implemention of this classifier is very closely modeled after the implementation
in `simple-statistics <https://github.com/simple-statistics/simple-statistics>`_,
the javascript analogue of ``simplestatistics``. You can find the javascript implementation
of the Bayesian classifier `here <https://github.com/simple-statistics/\
simple-statistics/blob/master/src/bayesian_classifier.js>`_.
.. _`naive Bayesian classifier`: https://en.wikipedia.org/wiki/Naive_Bayes_classifier
Examples:
Making a seventy-five/twenty-five classification.
>>> model1 = bayesian_classifier()
>>> model1.train({'species': 'cat'}, 'animal')
>>> model1.train({'species': 'cat'}, 'animal')
>>> model1.train({'species': 'cat'}, 'animal')
>>> model1.train({'species': 'cat'}, 'feline')
>>> model1.count
4
>>> model1.score({'species': 'cat'})
[('animal', 0.75), ('feline', 0.25)]
Classifying multiple things
>>> model2 = bayesian_classifier()
>>> model2.train({'species': 'cat'}, 'animal')
>>> model2.train({'species': 'dog'}, 'animal')
>>> model2.train({'species': 'dog'}, 'animal')
>>> model2.train({'species': 'cat'}, 'chair')
>>> model2.score({'species': 'cat'})
[('chair', 0.25), ('animal', 0.25)]
>>> model2.score({'species': 'dog'})
[('animal', 0.5), ('chair', 0)]
Testing multiple properties
>>> model3 = bayesian_classifier()
>>> model3.train({'species': 'cat'}, 'animal')
>>> model3.train({'species': 'cat'}, 'animal')
>>> model3.train({'species': 'cat'}, 'animal')
>>> model3.train({'species': 'cat'}, 'chair')
>>> model3.train({'species': 'cat', 'color': 'white'}, 'chair')
>>> model3.score({'color': 'white'})
[('chair', 0.2), ('animal', 0)]
>>> mod = bayesian_classifier()
>>> mod.score({'color': 'purple'}) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
RuntimeError: The model has not been trained yet. Train the model ... item.
"""
def __init__(self):
# number of items the model trained on
self.count = 0
# storage for all property and category keys and values
self.store = {}
[docs] def train(self, item, category):
"""
The method to train the instance of the Bayesian classifier
on an item.
Args:
item: A dict of property-value pairs in the form {property_1: value1,
property2: value2, . . .} for the item. category: A string of the
category of the item.
Returns:
null
"""
# if the data store doesn't contain this category,
# create a new key with the value being an empty
# dictionary
if category not in self.store:
self.store[category] = {}
# iterate through key/property-value pairs in item
for item_key in item:
item_value = item[item_key]
# if it's the first time the model sees this property for this category,
# add a dictionary for it
if item_key not in self.store[category]:
self.store[category][item_key] = {}
# if this is the first time the model sees a value for this property + category,
# add an entry in the dict for that property and set value to 0
if item_value not in self.store[category][item_key]:
self.store[category][item_key][item_value] = 0
# increment the value tied to the property + category
self.store[category][item_key][item_value] += 1
# increment count of trained items
self.count += 1
[docs] def score(self, item):
"""
Scores a certain item based on the learning the model has done
on previous items.
Args:
item: A dict in the form {property: value} of the item you want to score.
Returns:
A list containing tuples of properties and scores. The list is ordered
in descending order of scores.
"""
if self.store == {}:
raise RuntimeError('The model has not been trained yet. Train the '
'model before trying to score an item.')
# iterate through each key in the item to be scored
# and then iterate through each category used in previous
# .train() calls
odds = {}
odds_sums = {}
# iterate over properties and their values in the item to score
for item_key in item:
# item_key is the property (e.g., color), item_value is value of property (e.g., white)
item_value = item[item_key]
# iterate over all categories that the model has trained on in the past
for category in self.store:
# if we haven't added this category to our odds dict, add it with a dict as value
if category not in odds:
odds[category] = {}
if item_key in self.store[category]:
# if the model has seen this value for this property for this category before
# return the number of times divided by total training trials
# we get a score for each unique property+value combination within a category
if item_value in self.store[category][item_key]:
odds[category][item_key + '_' + item_value] = \
self.store[category][item_key][item_value] / self.count
# otherwise, mark it zero.
else:
odds[category][item_key + '_' + item_value] = 0
# MARK IT ZERO.
else:
odds[category][item_key + '_' + item_value] = 0
# iterate over all categories in odds dict and sum up prob values
# for unique property+value combinations
for category in odds:
for combination in odds[category]:
if combination not in odds_sums:
odds_sums[category] = 0
odds_sums[category] += odds[category][combination]
# return a list of properties and scores in descending order of scores
return(sorted(odds_sums.items(), key=lambda x: x[1], reverse=True))