1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
| import queue import math corpus = r"""ppap ppap ppap I have a pen , I have an apple . ( Eh~ ) Apple pen ! I have a pen , I have a pineapple . ( Eh~ ) pineapple pen ! apple pen ~ pineapple pen ( Eh~ ) pen pineapple Apple pen ! pen pineapple Apple pen . you don't have an apple . I don't have a pen . he has a pineapple . he doesn't have an apple ! """ from collections import defaultdict
class LessThanWin(object):
    """Tokens collected so far while the sliding window is not yet full."""

    def __init__(self, s: list):
        # The list is stored as given (no copy is made).
        self.present = s

    def size(self):
        """Return the number of tokens currently held."""
        token_count = len(self.present)
        return token_count

    def hash(self):
        """Return the held tokens joined by single spaces (used as a dict key)."""
        separator = ' '
        return separator.join(self.present)
class NNode(object):
    """A full window of tokens, split into a context and the final token."""

    def __init__(self, s: list):
        context, last_token = s[:-1], s[-1]
        # `given` is every token except the last; `present` is a one-element
        # list holding the last token.
        self.given = context
        self.present = [last_token]

    def size(self):
        """Return the length of `present`."""
        return len(self.present)

    def hash_given(self):
        """Return the context tokens joined by single spaces."""
        separator = ' '
        return separator.join(self.given)

    def hash(self):
        """Return context plus final token joined by single spaces."""
        whole_window = self.given + self.present
        return ' '.join(whole_window)
class Que(object):
    """Fixed-size sliding window over a stream of tokens.

    Each `put` appends one token and returns a node describing the window:
    a `LessThanWin` while fewer than `maxsize` tokens have been seen, and an
    `NNode` once the window holds `maxsize` tokens.
    """

    def __init__(self, maxsize: int):
        self.q = []             # tokens currently inside the window
        self.maxsize = maxsize  # window length at which NNode is produced
        self.size = 0           # kept equal to len(self.q)

    def put(self, any):
        """Append one token and return the node for the current window.

        The parameter name `any` shadows the builtin; it is kept unchanged so
        that existing keyword callers continue to work.

        Returns:
            `LessThanWin` holding a snapshot of the tokens seen so far when
            the window is not yet full, otherwise `NNode` built from the
            `maxsize` most recent tokens.
        """
        self.size += 1
        self.q.append(any)

        if self.size < self.maxsize:
            # Pass a copy: the internal list is appended to on later calls,
            # which would otherwise silently change a node already returned.
            return LessThanWin(list(self.q))

        if self.size > self.maxsize:
            # Window overflowed by one token: drop the oldest.
            self.q = self.q[1:]
            self.size -= 1
        return NNode(self.q)
class model_args(object):
    """Bundle of count tables and settings produced by training.

    Attributes are stored exactly as passed in; `sum_n_1` is the sum of all
    values in `counter_present_n_1`.
    """

    def __init__(self, counter_given, counter_present, counter_present_n_1,
                 lenth, win):
        # Count tables.
        self.counter_given = counter_given
        self.counter_present = counter_present
        self.counter_present_n_1 = counter_present_n_1

        # Scalar settings.
        self.char_len = lenth
        self.win = win

        # Total of the values stored in counter_present_n_1.
        self.sum_n_1 = sum(counter_present_n_1.values())
def train(text: str, window: int):
    """Count n-gram statistics over `text` and return them as a `model_args`.

    `text` is split on newlines; each line is stripped and split on
    whitespace into tokens, and a fresh `Que(window)` is used per line so
    windows never span two lines.

    For every token fed to the window:
      * if the window is still partial and holds exactly `window - 1`
        tokens, that prefix is counted in `counter_present_n_1`;
      * if the window is full, the context (all but the last token) is
        counted in `counter_given` and the whole window in `counter_present`.

    Args:
        text: training corpus, one sentence per line.
        window: n-gram length passed to `Que`.

    Returns:
        `model_args` holding the three count tables, the number of distinct
        tokens seen, and `window`.
    """
    counter_present = defaultdict(int)
    counter_present_n_1 = defaultdict(int)
    counter_given = defaultdict(int)
    all_char = set()

    # Train on the `text` argument; the previous code read the module-level
    # `corpus` here and silently ignored whatever the caller passed in.
    for line in text.split('\n'):
        chars = line.strip().split()
        q = Que(window)
        for ch in chars:
            all_char.add(ch)
            node = q.put(ch)
            if isinstance(node, LessThanWin):
                # Only the (window - 1)-token prefix of a line is counted;
                # shorter prefixes are skipped.
                if node.size() == window - 1:
                    counter_present_n_1[node.hash()] += 1
            else:
                counter_given[node.hash_given()] += 1
                counter_present[node.hash()] += 1

    return model_args(counter_given, counter_present, counter_present_n_1,
                      len(all_char), window)
def predict(s: str, model: model_args):
    """Return the base-10 log score of sentence `s` under `model`.

    The sentence is split on whitespace and fed token by token through a
    `Que(model.win)` window. Each scored step adds the log10 of an
    add-one-smoothed ratio:
      * partial window holding exactly `win - 1` tokens:
        (count of that prefix + 1) / (model.sum_n_1 + model.char_len)
      * full window:
        (count of the window + 1) / (count of its context + model.char_len)
    Partial windows of any other length contribute nothing.

    Args:
        s: whitespace-separated sentence to score.
        model: trained statistics as returned by `train`.

    Returns:
        The accumulated log10 score; the integer 0 if no step was scored
        (for example an empty sentence).

    Raises:
        ZeroDivisionError: if a denominator is zero, which happens when the
            model has no counts and `char_len` is 0.
    """
    counter_given = model.counter_given
    counter_present = model.counter_present
    counter_present_n_1 = model.counter_present_n_1
    charV = model.char_len
    window = model.win

    q = Que(window)
    p = 0
    for ch in s.split():
        node = q.put(ch)
        if isinstance(node, LessThanWin):
            if node.size() == window - 1:
                # .get() instead of indexing: the tables are defaultdicts, so
                # indexing an unseen key would insert it and mutate the model.
                seen = counter_present_n_1.get(node.hash(), 0)
                p += math.log10((seen + 1.0) / (model.sum_n_1 + charV))
        else:
            seen = counter_present.get(node.hash(), 0)
            context_seen = counter_given.get(node.hash_given(), 0)
            p += math.log10((seen + 1.0) / (context_seen + charV))

    return p
if __name__ == "__main__":
    # Demo: train a model with window length 3 on the built-in corpus, then
    # print the log10 score of a handful of sample sentences.
    win = 3
    test_list = [
        'I have a pen .',
        'you have a pen .',
        'I am a pen .',
        'he are a apple .',
        'he has a pen .',
        'ppap ppap ppap',
    ]
    model = train(corpus, win)

    for sentence in test_list:
        p = predict(sentence, model)
        print(sentence, ' log10(p)=', p)
|