朴素贝叶斯分类算法

这是一次作业的笔记

要求及数据集

https://www.kaggle.com/c/naivebayes-21/overview
The overall goal of the exercise is to get hands-on experience with the implementation of a popular machine learning scheme and to work on a real-world task. The task is to implement an improved version of the Naive Bayes algorithm that is able to predict the domain - one of Archaea, Bacteria, Eukaryota or Virus - from the abstract of research papers about proteins taken from the MEDLINE database. You will then apply your implementation on a test set without class labels and hand in the predictions of your implementation.

思路

朴素贝叶斯 + 拉普拉斯平滑 + 停用词过滤

具体实践

import math
from collections import Counter

import pandas as pd

# Common English function words (pronouns, auxiliaries, prepositions, ...)
# plus contraction fragments such as "don" / "ll" / "ve".  Tokens found in
# this list are skipped by the classifier.  Kept as a plain list, in the
# original order; the literal is whitespace-split purely for readability.
commonWords = """
    i me my myself we our ours ourselves you your yours yourself
    yourselves he him his himself she her hers herself it its itself
    they them their theirs themselves what which who whom this that these
    those am is are was were be been being have has had having do
    does did doing a an
    the and but if or because as until while of at by for with
    about against between into through during before after above below to
    from up down in out on off over under again further then once
    here there when where why how all any both each few more most
    other some such no nor
    not only own same so than too very s t can will just don
    should now d ll m o re ve y ain aren couldn didn doesn hadn
    hasn haven isn ma mightn mustn needn shan shouldn wasn weren won
    wouldn
""".split()

class myNaiveBayesAbstract:
    """Multinomial Naive Bayes text classifier with Laplace smoothing.

    Training data is read from a CSV file that has one label column and one
    abstract (free text) column.  Abstracts are tokenised by splitting on a
    single space; tokens for which ``isCommonWord`` returns True are ignored
    both when counting and when scoring.

    Attributes:
        trainingSet: DataFrame returned by ``readTrainingSet``.
        trainingLabels: the label column of ``trainingSet``.
        trainingAbstract: the abstract column of ``trainingSet``.
        trainingSize: number of training rows (denominator of the priors).
        trLabelDic: maps every label to ``{'count': int, 'word': list}``,
            i.e. the number of abstracts collected for the label and the
            concatenation of all their tokens (stop words included).
    """

    def __init__(self, trainingSetName: str, labelName: str, abstractName: str):
        """Load the training CSV and create an empty entry for every label.

        Args:
            trainingSetName: path of the training CSV file.
            labelName: name of the column holding the class label.
            abstractName: name of the column holding the abstract text.
        """
        print('init trainingSet')
        self.trainingSet = self.readTrainingSet(trainingSetName)
        self.trainingLabels = self.trainingSet[labelName]
        self.trainingAbstract = self.trainingSet[abstractName]
        self.trainingSize = len(self.trainingSet)

        print('init labelData')
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # label order (and therefore tie-breaking) is the same on every run,
        # unlike iteration over a set of strings.
        self.trLabelDic = {
            label: {'count': 0, 'word': []}
            for label in dict.fromkeys(self.trainingLabels)
        }
        # Set once the training rows have been folded into trLabelDic.
        self._trained = False

        print(self.trLabelDic)

    def isCommonWord(self, word):
        """Return True when *word* is listed in the module-level commonWords."""
        return word in commonWords

    def readTrainingSet(self, name: str):
        """Read the training CSV *name* and return it as a DataFrame."""
        print('readTrainSet: ', name)
        return pd.read_csv(name)

    def collectAbstract(self, words, label):
        """Add one tokenised abstract to the statistics of *label*.

        Args:
            words: list of tokens of the abstract.
            label: class label of the abstract; an unknown label is reported
                on stdout and otherwise ignored.
        """
        if label not in self.trLabelDic:
            # f-string instead of "+" so that a non-string label (number,
            # NaN) is reported rather than raising TypeError.
            print(f"error can't find {label} in trLabelDic.")
            return

        entry = self.trLabelDic[label]
        entry['count'] += 1
        entry['word'] += words

    def calculateProbability(self, testData):
        """Return the most probable label for every abstract in *testData*.

        Each label is scored with
        ``log10 P(label) + sum(log10((count(word, label) + 1) / (N + V)))``
        where ``N`` is the number of non-stop-word tokens collected for the
        label and ``V`` is the number of distinct non-stop words seen in the
        training data or in *testData*.

        Args:
            testData: iterable of abstract strings (list or pandas Series).

        Returns:
            A list holding exactly one label per abstract, in input order.
            When several labels score equally, the one that comes first in
            ``trLabelDic`` is chosen.

        Raises:
            ValueError: if *testData* is non-empty but no label has any
                collected training abstract.
        """
        # Per-label word frequencies with the stop words removed.  Filtering
        # the distinct words of a Counter calls isCommonWord once per word
        # type instead of once per token.
        wordCounts = {}
        tokenTotals = {}
        for label, entry in self.trLabelDic.items():
            kept = {
                word: count
                for word, count in Counter(entry['word']).items()
                if not self.isCommonWord(word)
            }
            wordCounts[label] = kept
            # Only tokens that can appear in a numerator belong in the
            # denominator; stop words are excluded from both.
            tokenTotals[label] = sum(kept.values())

        vocabulary = set()
        for kept in wordCounts.values():
            vocabulary.update(kept)

        # Tokenise every test abstract once and reuse the result below.
        documents = [
            [word for word in abstract.split(" ")
             if not self.isCommonWord(word)]
            for abstract in testData
        ]
        if not documents:
            return []

        # Words that only occur in the test data enlarge the vocabulary by
        # one each, no matter how often they are repeated.
        unseenWords = set()
        for tokens in documents:
            unseenWords.update(t for t in tokens if t not in vocabulary)
        vocabularySize = len(vocabulary) + len(unseenWords)

        # A label without any collected abstract has prior 0 and can never
        # win, so it is left out instead of evaluating log(0).
        priors = {
            label: math.log(entry['count'] / self.trainingSize, 10)
            for label, entry in self.trLabelDic.items()
            if entry['count'] > 0
        }
        if not priors:
            raise ValueError(
                'no training data collected: call predict() or '
                'collectAbstract() before classifying')

        testLabels = []
        for tokens in documents:
            scores = {}
            for label, prior in priors.items():
                known = wordCounts[label]
                denominator = tokenTotals[label] + vocabularySize
                score = prior
                for token in tokens:
                    # Laplace (add-one) smoothing.
                    score += math.log(
                        (known.get(token, 0) + 1) / denominator, 10)
                scores[label] = score

            # Exactly one label per abstract, even when scores tie, so the
            # result always lines up with the input.
            testLabels.append(max(scores, key=scores.get))

        return testLabels

    def classify(self, testData):
        """Label *testData* using the statistics collected so far.

        Unlike ``predict`` this does not read the training set; it relies on
        ``predict`` or ``collectAbstract`` having been called before.
        """
        print('classify')

        return self.calculateProbability(testData)

    def _ensureTrained(self):
        """Fold the training rows into trLabelDic the first time it is called."""
        # getattr: instances created without __init__ have no flag yet.
        if getattr(self, '_trained', False):
            return

        # zip walks both columns by position, independent of the index.
        for abstract, label in zip(self.trainingAbstract, self.trainingLabels):
            self.collectAbstract(abstract.split(' '), label)
        self._trained = True

    def predict(self, abstracts):
        """Train on the training set (once) and label *abstracts*.

        Args:
            abstracts: iterable of abstract strings to classify.

        Returns:
            List with one predicted label per abstract.
        """
        # Training only once keeps repeated predict() calls from
        # double-counting the training abstracts.
        self._ensureTrained()

        return self.calculateProbability(abstracts)

# Train on trg.csv, label every abstract of tst.csv and write the result
# (every column except the abstract text) to out.csv.
naiveBayesAbstract = myNaiveBayesAbstract('trg.csv', 'class', 'abstract')

test_set = pd.read_csv("tst.csv")
test_set['class'] = naiveBayesAbstract.predict(test_set["abstract"])

test_set.drop(columns=['abstract']).to_csv('out.csv', index=False)

# Summary: number of abstracts assigned to each predicted class.  A single
# value_counts() pass replaces one boolean mask per label, and the f-string
# also works when the labels are not strings.
class_counts = test_set['class'].value_counts()
test_labels = list(class_counts.index)

for key in test_labels:
    print(f'{key}: {class_counts[key]}')