<center style="font-size: 32px; font-weight: bold;" >N-граммы</center>

# Библиотека nltk

https://www.nltk.org/api/nltk.lm.html

## Словарь

Надстройка над  ```Counter()```. Его экземпляр в ```vocab.counts```.<br>
Заменяет редкие слова (параметр ```unk_cutoff```) на ```<UNK>```


In [None]:
from nltk.lm import Vocabulary

# Toy corpus: the letters of "abracadabra".
words = ['a', 'b', 'r', 'a', 'c', 'a', 'd', 'a', 'b', 'r', 'a']
# Words seen fewer than unk_cutoff times are not considered part of the vocabulary.
vocab = Vocabulary(words, unk_cutoff=2)

print(vocab['a'], 'a' in vocab)       # 5  True
print(vocab['c'], 'c' in vocab)       # 1  False (unk_cutoff=2)
print(vocab['Z'], 'Z' in vocab)       # 0  False

vocab.update(["c", "c"])              # add more occurrences of 'c'
print(vocab['c'], 'c' in vocab)       # 3  True

print( vocab )                        # <Vocabulary with cutoff=2 unk_label='<UNK>' and 5 items>
print( len(vocab) )                   # 5: number of words, including the '<UNK>' token
print( sorted(vocab) )                # ['<UNK>', 'a', 'b', 'c', 'r']   what is treated as the vocabulary
print( sorted(vocab.counts) )         # ['a', 'b', 'c', 'd', 'r']  everything it has received

# NOTE: the three lookups below are bare expressions, so their values are not printed.
vocab.lookup("a")                     # 'a'
vocab.lookup(["a", "aliens"])         # ('a', '<UNK>')
vocab.lookup(["a", "b", ["x", "b"]])  # ('a', 'b', ('<UNK>', 'b'))


# (word, count) pair for every vocabulary entry; shown as the cell result.
[ (v, vocab[v])  for v in vocab] 

## n-grams

In [None]:
from nltk.util import ngrams, bigrams
    
print( list(ngrams([1,2,3,4,5], 1)) )          # n=1 gives unigrams: [(1,), (2,), (3,), (4,), (5,)]; n=3 would give [(1, 2, 3), (2, 3, 4), (3, 4, 5)]
print( list(bigrams([1,2,3,4,5]  ) ) )         # [(1, 2), (2, 3), (3, 4), (4, 5)]
    


In [None]:
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline

# pad_both_ends adds n-1 start ('<s>') and end ('</s>') markers around a sentence.
print( list( pad_both_ends(['a','b'], n=2) ) ) # ['<s>', 'a', 'b', '</s>']
print( list( pad_both_ends(['a','b'], n=3) ) ) # ['<s>', '<s>', 'a', 'b', '</s>', '</s>']

# Bigrams over the padded sentence: [('<s>', 'a'), ('a', 'b'), ('b', '</s>')]
print (list( bigrams( pad_both_ends(['a','b'], n=2) ) ) )

In [None]:
sents = [['a', 'b'], ['b', 'a'] ]    # 2 "sentences"
# The pipeline returns two lazy iterators: the training n-grams and the padded word stream.
train, vocab = padded_everygram_pipeline(2, sents)
print( list(vocab) )                 # materialise the word stream to inspect it

## Условные вероятности

In [None]:
from nltk.lm import MLE, WittenBellInterpolated

sents = [['a','b','c'], ['b', 'a'] ]      # 2 "sentences"
train, vocab = padded_everygram_pipeline(2, sents)

#print(list(list(train)[2]))

lm = MLE(2)                              # bigram model (order 2)

lm.fit(train, vocab)
print(lm.counts.N())                     # 16: total number of n-grams, n=1,2
print(list(lm.counts.unigrams))
print(sorted(lm.counts[['a']].items()) )  # counts of the words that follow the context 'a'

#lm.fit([[("a", "b"), ("b", "c")]],  ['a', 'b', 'c'])

print(lm.vocab )
print(sorted(lm.vocab) )

print(lm.counts)
print(lm.score)                          # NOTE(review): prints the bound method itself, not a score
print(lm.counts['a'],        lm.score("a"), 2/9)    # unigram count of 'a', its score, and 2/9 for comparison
print(lm.counts[['a']]['b'], lm.score("b", ["a"]))  # a => b

In [None]:
#test = [('a', 'b'), ('b', 'a'), ('a', 'b')]

test = [('a', 'b'), ('a', 'c')]          # n-grams to evaluate: (context..., word)

# NOTE(review): the expected values in the comments below look like they belong to the
# commented-out three-bigram test set above (three scores are listed); re-check them
# for the active two-bigram test set.
print(lm.score('c', ['a']))
print("e=", lm.entropy(test) )                           # 1.0
print("p=", lm.perplexity(test), 2**lm.entropy(test) )   # 2.0   2.0

print( [ lm.score(b[-1], b[:-1] )  for b in test] )      # [0.5, 0.5, 0.5]

# Cross-entropy by hand: mean negative log-score of each n-gram's last word given the rest.
print("e=", sum( [ -lm.logscore(ng[-1], ng[:-1]) for ng in  test] )/len(test) )
# Entropy of the unigram distribution, skipping the '<UNK>' label.
print( sum( [  -lm.score(w)*lm.logscore(w) for w in  lm.vocab if w != '<UNK>'] ) )

w = test[0]

# Log-score of the first test n-gram, with its context passed as a list.
print(lm.logscore(w[-1], list(w[:-1])))


In [None]:
# Sample from the fitted model; random_seed makes the output reproducible.
print( lm.generate(5, random_seed=3) )
print( lm.generate(4, text_seed=['</s>'], random_seed=3) )   # start from the end-of-sentence marker

# Inspect what was counted after the '</s>' context, via two equivalent-looking access paths.
print(sorted(lm.counts[['</s>']].items()) )
print(lm.context_counts(lm.vocab.lookup(['</s>'])))

## Языковые модели

In [None]:
from nltk.lm.api import LanguageModel

class Backoff(LanguageModel):
    """Back-off model: score a word with the longest context in which it was seen.

    The relative frequency found at the first matching context level is returned
    as is, with no discounting, so the scores over the vocabulary are not
    renormalised.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def unmasked_score(self, word, context=None):
        """Return the relative frequency of `word` after the longest seen context.

        :param word: the word to score.
        :param context: sequence of preceding words, or None/empty for a
            unigram score.
        :return: count(context, word) / count(context) for the longest suffix
            of `context` in which `word` was observed, or 0 if it was never seen.
        """
        # The default context is None; normalise it so len() and slicing work.
        context = tuple(context) if context else ()
        # Drop the oldest context word at each step; the empty context is last.
        for start in range(len(context) + 1):
            counts = self.context_counts(context[start:])
            word_count = counts[word]
            if word_count > 0:
                return word_count / counts.N()
        return 0
    
class Smooth(LanguageModel):
    """Interpolation of relative frequencies over progressively shorter contexts.

    The context level reached after dropping `i` leading words is weighted by
    `beta ** i`; the sum is normalised by the total weight of the levels that
    have any counts at all.
    """

    def __init__(self, beta, *args, **kwargs):
        """
        :param beta: multiplicative weight decay applied each time the context
            is shortened by one word.
        Remaining arguments are passed to `LanguageModel`.
        """
        super().__init__(*args, **kwargs)
        self.beta = beta

    def unmasked_score(self, word, context=None):
        """Return the weighted average of the relative frequencies of `word`.

        :param word: the word to score.
        :param context: sequence of preceding words, or None/empty for a
            unigram score.
        :return: the interpolated score, or 0 when no context level (including
            the unigram level) has any counts.
        """
        # With an empty context the loop runs once, over the unigram counts.
        context = tuple(context) if context else ()

        prob, norm, coef = 0, 0, 1
        for start in range(len(context) + 1):
            counts = self.context_counts(context[start:])
            total = counts.N()
            if total > 0:
                prob += coef * (counts[word] / total)
                norm += coef
            # The weight decays even when a level is skipped for lack of counts.
            coef *= self.beta
        # Guard against division by zero, e.g. on a model that was never fitted.
        return prob / norm if norm > 0 else 0

In [None]:
from nltk.lm import MLE, Laplace, WittenBellInterpolated, KneserNeyInterpolated, Lidstone

sents = [['a','b','c'], ['b', 'a'] ]      # 2 "sentences"

train, vocab = padded_everygram_pipeline(2, sents)


# Pick exactly one model; the alternatives are left commented out for experiments.
#lm = MLE(2)
lm = Laplace(2)
#lm = WittenBellInterpolated(2)
#lm =KneserNeyInterpolated(2)
#lm = Lidstone(0.5, 2)

#lm = Backoff(2)

#lm = Smooth(0.5, 2)

lm.fit(train, vocab)

print(len(lm.vocab))

test = [('a', 'b'), ('a', 'c')]          # n-grams to evaluate: (context..., word)

print("e=", lm.entropy(test) )   

# Scores of every vocabulary word after the context 'a', and their sum
# (a check of whether they add up to one).
p = [ lm.score(w, ['a']) for w in lm.vocab ] 
print(p)
print(sum( p ))

## OOV

In [None]:
sents = [['a', 'b', 'c'], ['b', 'a'] ]      # 2 "sentences"

# Build a vocabulary with cutoff 2, then replace rarer words by '<UNK>' before training.
vocab = Vocabulary(unk_cutoff=2)
for s in sents: 
    vocab.update(s)
sents = vocab.lookup(sents)               # (('a', 'b', '<UNK>'), ('b', 'a'))
print(sents)

# NOTE: `vocab` is rebound here to the word stream returned by the pipeline.
train, vocab = padded_everygram_pipeline(2, sents)

lm = MLE(2)
lm.fit(train, vocab)

print(lm.counts.N())

print(list(lm.vocab))
print(lm.counts['<UNK>'])
print(lm.counts['a'])
print(lm.counts[['b']]['<UNK>'])

print("b=>a : ", lm.counts[['b']]['a'] )
print("b=>c : ", lm.counts[['b']]['c'] )

print(sorted(lm.counts[['b']].items()) )

print( lm.score('b') )
print( lm.score('a',  ['b']) )

print( lm.score('<UNK>'))               # 0.11111 = 1/9
print( lm.score('<UNK>',  ['b'] ) )     # 0.5

#lm.vocab.lookup( [['b']])


# Библиотека my_ngrams


In [71]:
import my_ngrams
import importlib
importlib.reload(my_ngrams)                        

from  my_ngrams import NGramsCounter, Markov, MarkovLaplas  
from  my_ngrams import MarkovInterpolated, MarkovLaplasInterpolated, MarkovBackoff

## Пример для символов

In [None]:
# Character-level demo of NGramsCounter on a short phrase.
counter = NGramsCounter(3)                      # will hold 1-, 2- and 3-grams
counter.add('мама мыла раму')                   # count the n-grams

#print("root:",counter.root)
print("размер словаря:", len(counter.root))     # 7
print("слова  словаря:", counter.branches())    # [('м', 4), ('а', 4), ...]
print(counter.ngrams[1:])                       # [14, 13, 12] number of n-grams, n=1,2,3
print(counter.unique() )                        # [7, 10, 12] unique n-grams

N1, N2, _ = counter.counts('м')                 # N1 - occurrences of 'м', N2 - text length
print("N('м'), N       = ", N1, N2)             # 4, 14
print("P('м')          = ", counter.prob('м'))  # 0.2857 = 4/14 = N1/N2

N1, N2, _ = counter.counts('ма')                # N1 - occurrences of 'ма', N2 - occurrences of 'м'
print("N('ма'),N('м')  = ", N1, N2)             # 2, 4
print("P('м=>а')       = ", counter.prob('ма')) # 0.5 = 2/4 = N1/N2

print("P(ма=>м) = ", counter.prob('мам'))       # 0.5     conditional
print("P(мам)   = ", counter.prob('мам', False))# 0.083 = 1/12 joint

print(counter.branches('ма'))                   # [('м', 1), (' ', 1)] what follows 'ма'

branches = counter.all_branches()               # [(['м', 'а', 'м'], 1), (['м', 'а', ' '], 1) ... all branches
print(branches)

# The label and comment used to say 'м' (copy-paste) although 'у' is queried.
N1, N2, _ = counter.counts('у')                 # N1 - occurrences of 'у', N2 - text length
print("N('у'), N       = ", N1, N2)             # presumably 1, 14 ('у' occurs once) - TODO confirm


In [None]:
# Pick one model built on top of the counter; alternatives are left commented out.
#lm = MarkovLaplas(beta=1, order=3, counter=counter)
#lm = Markov(order=3, counter=counter)
lm = MarkovInterpolated(beta=0.5, order=3, counter=counter)

print(lm.counter.prob("ру"))                    #  0    
# NOTE(review): the value below is an add-one formula, presumably written for MarkovLaplas
# rather than the active MarkovInterpolated model - confirm.
print(lm.prob("му"))                            #  0.125 = (0+1)/(1+7) 

print(lm.perplexity("мама умыла раму"))         # 4.886

# Probabilities of every known character after the context 'ма', and their sum.
probs = [ lm.prob('ма' + w) for w,_ in lm.counter.branches() ]
print(sum(probs), probs)
print(f"|{lm.generate('ма', 10)}|")

In [None]:
# Scratch check: model probability for 'ум' shown next to the ratio 4/14.
( lm.prob('ум'), 4/14)
#[ (w, lm.prob('у'+w), counter.prob(w)) for w,_ in lm.counter.branches()]


In [None]:
# Scratch cell: a list of five zeros.
[0]*5

## Пример для слов

In [None]:
# Word-level demo: the counter is fed a list of tokens instead of a string.
counter = NGramsCounter(3)
counter.add(['мама','мыла','раму','мама','мыла','папу','EOS','EOS'])
print(counter.ngrams[1:])
print(counter.prob(['мама','мыла','раму']))
print(counter.prob(['мама','мыла','папу']))
print(counter.branches())
print(counter.branches(['мама','мыла']))
print(counter.branches(['папу']))

# Читаем текстовый файл и составляем словарь
## Русские тексты

In [None]:
import numpy as np
import zipfile
import re
import math
import time

def preprocess(s):
    """Normalise raw text and detach punctuation marks from words.

    Exotic space characters and double quotes / guillemets become spaces,
    whitespace runs collapse to one space, and the text is lower-cased. A
    space is then inserted before each of `, ! ? . … : ;` unless one is
    already there or the mark is the first character.

    :param s: raw text.
    :return: the cleaned text wrapped in one leading and one trailing space,
        so whole words can be searched as " word ".
    """
    s = s.translate({ord(c): ' ' for c in "\u202f\u200b\xa0"})  # exotic spaces
    s = s.translate({ord(c): ' ' for c in "\"«»"})               # quotation marks
    s = re.sub(r'\s+', ' ', s).strip()                           # collapse whitespace
    s = s.lower()

    punctuation = (',', '!', '?', '.', '…', ':', ';')
    # The preceding character is read from the unmodified string, so in "a..."
    # every dot gets its own space ("a . . .").
    out = [
        ' ' + char if i > 0 and char in punctuation and s[i - 1] != ' ' else char
        for i, char in enumerate(s)
    ]

    return ' ' + "".join(out) + ' '

def preprocess1(txt):
    """Clean `txt`: blank out junk characters, tidy whitespace, lower-case.

    :param txt: raw text.
    :return: the text with the listed characters replaced by spaces, whitespace
        before a full stop removed, whitespace runs collapsed and the result
        stripped and lower-cased.
    """
    # NOTE(review): the table also blanks the Latin letters 'x' and 'e' (the
    # trailing "xe" of the second string); this looks like the residue of a
    # broken escape sequence - confirm before relying on it for Latin text.
    junk = "\u202f\u200b\xa0" + "\"\'…—–-«»xe"
    table = {ord(c): ' ' for c in junk}
    table[769] = ' '                      # U+0301 combining acute accent
    txt = txt.translate(table)
    txt = re.sub(r'\s+\.', '.', txt)      # drop whitespace before a full stop
    txt = re.sub(r'\s+', ' ', txt).strip()
    return txt.lower()

def preprocess2(txt, abc=" абвгдежзийклмнопрстуфхцчшщъыьэюя"):
    """Keep only characters from the alphabet `abc`; everything else becomes a space.

    :param txt: raw text.
    :param abc: the allowed characters; the default is a space plus the Russian
        lower-case letters without 'ё'.
    :return: the lower-cased text with 'ё' folded to 'е', foreign characters
        blanked, whitespace runs collapsed and the ends stripped.
    """
    txt = txt.lower().translate({ord('ё'): 'е'})
    # One join instead of growing the result character by character.
    res = "".join(ch if ch in abc else ' ' for ch in txt)
    return re.sub(r'\s+', ' ', res).strip()
            
    
# Read every book from the archive and build three corpora:
text     = ""      # all texts
text_trn = ""      # training part: each book except its last tst_len characters
text_tst = ""      # test part: the last tst_len (20%) characters of each book

with zipfile.ZipFile("C:/!/Data/nlp/books/books.zip") as myzip:
    for fname in myzip.namelist():
        print(fname, end=": ")
        with myzip.open(fname) as myfile:
            st = myfile.read().decode("utf-8")
        text += st

        tst_len = int(len(st) * 0.2)           # number of characters held out for testing (20%)
        # Split at an explicit index: with tst_len == 0 the slices st[:-0] and
        # st[-0:] would put nothing in the training part and everything in the test part.
        split_at = len(st) - tst_len
        text_trn += st[:split_at]
        text_tst += st[split_at:]

        st = preprocess(st).split()
        #print(st[:500])
        print(len(st), "words")

text     = preprocess2(text)                    # keep only characters of the alphabet
print("len(text):    ", len(text))
text_trn = preprocess2(text_trn)
print("len(text_trn):", len(text_trn))
text_tst = preprocess2(text_tst)
print("len(text_tst):", len(text_tst))

chars = sorted(set(text))                       # all distinct characters of the text (the vocabulary)
ch2id = {c: i for i, c in enumerate(chars)}     # lookup table char -> index
id2ch = dict(enumerate(chars))                  # lookup table index -> char

print(len(chars), "\n", chars, "\n", len(text), "\n", text[:500])    

## Английские тексты ROCStories

In [56]:
%%time
import importlib
import re
from   collections import Counter

import numpy as np
import pandas as pd

import my_nlp                         # see my_nlp.py
importlib.reload(my_nlp)              # pick up edits to my_nlp without restarting the kernel

df = pd.read_csv('C:/!/Data/nlp/ROCStories/100KStories.zip', sep=',')  

# Each story is a row; columns 2..6 are read as its five sentences.
# Rows are iterated as plain tuples instead of one scalar .iloc lookup per cell.
docs = [
    [my_nlp.preprocess(sentence) for sentence in row]
    for row in df.iloc[:, 2:7].itertuples(index=False, name=None)
]

print("len(docs):", len(docs))  

len(docs): 98167
Wall time: 17.2 s


### Буквы

In [58]:
# Character-level corpora: each document is a list of five sentence strings.
# Strings are joined once instead of being grown with += for every document.
text     = "".join("".join(d)     for d in docs)   # all five sentences
text_trn = "".join("".join(d[:4]) for d in docs)   # first four sentences -> training
text_tst = "".join(d[4]           for d in docs)   # fifth sentence -> test

print(docs[:2])
print(len(text),text[:300])
print(len(text_trn),text_trn[:300])
print(len(text_tst), text_tst[:300])

[[" dan 's parents were overweight . ", ' dan was overweight as well . ', ' the doctors told his parents it was unhealthy . ', ' his parents understood and decided to make a change . ', ' they got themselves and dan on a diet . '], [' carrie had just learned how to ride a bike . ', " she didn't have a bike of her own . ", " carrie would sneak rides on her sister 's bike . ", ' she got nervous on a hill and crashed into a wall . ', ' the bike frame bent and carrie got a deep gash on her leg . ']]
23583631  dan 's parents were overweight .  dan was overweight as well .  the doctors told his parents it was unhealthy .  his parents understood and decided to make a change .  they got themselves and dan on a diet .  carrie had just learned how to ride a bike .  she didn't have a bike of her own .  carrie 
18550869  dan 's parents were overweight .  dan was overweight as well .  the doctors told his parents it was unhealthy .  his parents understood and decided to make a change .  carrie had 

### Слова

In [57]:
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
 
# Word-level corpora for the first 10 documents: every sentence becomes a list
# of tokens padded with '<s>' / '</s>'.
text, text_trn, text_tst = [], [], []
for d in docs[:10]:
    for i, s in enumerate(d):
        tokens = list( pad_both_ends(s.split(), n=2) )
        text.append(tokens)
        # Append the token list itself: `list += str` would extend the list
        # with the individual characters of the sentence.
        if i < 4:
            text_trn.append(tokens)     # first four sentences -> training
        else:
            text_tst.append(tokens)     # fifth sentence -> test

text

[['<s>', 'dan', "'s", 'parents', 'were', 'overweight', '.', '</s>'],
 ['<s>', 'dan', 'was', 'overweight', 'as', 'well', '.', '</s>'],
 ['<s>',
  'the',
  'doctors',
  'told',
  'his',
  'parents',
  'it',
  'was',
  'unhealthy',
  '.',
  '</s>'],
 ['<s>',
  'his',
  'parents',
  'understood',
  'and',
  'decided',
  'to',
  'make',
  'a',
  'change',
  '.',
  '</s>'],
 ['<s>',
  'they',
  'got',
  'themselves',
  'and',
  'dan',
  'on',
  'a',
  'diet',
  '.',
  '</s>'],
 ['<s>',
  'carrie',
  'had',
  'just',
  'learned',
  'how',
  'to',
  'ride',
  'a',
  'bike',
  '.',
  '</s>'],
 ['<s>',
  'she',
  "didn't",
  'have',
  'a',
  'bike',
  'of',
  'her',
  'own',
  '.',
  '</s>'],
 ['<s>',
  'carrie',
  'would',
  'sneak',
  'rides',
  'on',
  'her',
  'sister',
  "'s",
  'bike',
  '.',
  '</s>'],
 ['<s>',
  'she',
  'got',
  'nervous',
  'on',
  'a',
  'hill',
  'and',
  'crashed',
  'into',
  'a',
  'wall',
  '.',
  '</s>'],
 ['<s>',
  'the',
  'bike',
  'frame',
  'bent',
  'and',

# Эксперименты

In [None]:
%%time
from nltk.lm import MLE, Laplace, WittenBellInterpolated, KneserNeyInterpolated, Lidstone

n = 5                                  # highest n-gram order

#train, vocab = padded_everygram_pipeline(n, [chars])

# Pick one model; the alternatives are left commented out for experiments.
#lm = MLE(n)
#lm = Laplace(n)
#lm = Lidstone(0.5, n)
#lm = WittenBellInterpolated(n)
#lm =KneserNeyInterpolated(n)
#lm = Backoff(n)
lm = Smooth(0.5, n)

# NOTE(review): elsewhere in this notebook `chars` is the sorted set of distinct
# characters, so the n-grams here are counted over the alphabet and not over the
# corpus - presumably a text (e.g. text_trn) was intended; confirm.
lm.fit( [ ngrams(chars,1) ], chars)    # unigrams and the vocabulary
for i in range(2, n+1):
    lm.fit( [ ngrams(chars, i) ] )     # add the next n-gram order
    print(f"n:{i:2d}> ngrams:{lm.counts.N():10d}")

#lm.fit(train, vocab)

print(len(lm.vocab))
print(list(lm.vocab))

In [None]:
print(lm.score('о', ['р']))     # score of 'о' after the context 'р'
print(lm.counts.N())            # total number of counted n-grams

In [None]:
from operator import itemgetter
# Unigram score of every vocabulary entry, listed from most to least probable.
p=[ (w, lm.score(w)) for w in lm.vocab ] 
sorted(p, key=itemgetter(1), reverse=True)


In [None]:
#lm.score('б', ['ч','п','р'])
# Raw count of 'б' after the three-character context 'ч','п','р'.
lm.counts[['ч','п','р']]['б']

In [None]:

# Scores of every vocabulary entry after the context 'ч','п','р', and their sum
# (a check of whether they add up to one).
probs = [ lm.score(w, ['ч','п','р']) for w in lm.vocab ] 
print([f"{p:.8f}" for p in probs])
print(sum( probs ))


In [None]:
%%time
# NOTE(review): CondProb is neither defined nor imported in the visible part of
# this notebook - presumably a leftover from an earlier version; confirm.
cond= CondProb(5)
cond.add(text)

# Частотность букв при помощи Counter

In [None]:
%%time
from collections import Counter

# Character frequencies of `text`; Counter counts an iterable directly.
counter = Counter(text)

print("VOC_SIZE:", len(counter), "TXT_LEN:", sum(counter.values()))

# Character, absolute count and relative frequency, most frequent first.
total = len(text)
for ch, num in counter.most_common(10000):
    print(f"{ch}, {num:d}, {num / total:.5f}")

# Энтропия 
## Энтропия букв

In [48]:
# Re-clean the three corpora with an English alphabet: space, apostrophe, a-z.
latin_abc = " 'abcdefghijklmnopqrstuvwxyz"
text     = preprocess2(text,     abc=latin_abc)
text_trn = preprocess2(text_trn, abc=latin_abc)
text_tst = preprocess2(text_tst, abc=latin_abc)

In [46]:
%%time
# Count only 1-grams (single characters) over the whole text.
counter = NGramsCounter(1)
print(len(text))
counter.add(text)

21943099
Wall time: 35.2 s                 37.6%


In [49]:
# One "<char> <share>" string per character: its count divided by counter.ngrams[1]
# (presumably the total number of 1-grams - verify against my_ngrams).
[ f"{w} {n/counter.ngrams[1]:.4f}" for w, n in counter.branches() ]

['  0.1957',
 'e 0.1055',
 't 0.0725',
 'a 0.0679',
 'o 0.0592',
 'h 0.0533',
 'n 0.0501',
 'i 0.0498',
 's 0.0486',
 'r 0.0451',
 'd 0.0415',
 'l 0.0311',
 'w 0.0215',
 'c 0.0196',
 'u 0.0189',
 'm 0.0188',
 'g 0.0179',
 'y 0.0163',
 'f 0.0157',
 'p 0.0140',
 'b 0.0124',
 'k 0.0100',
 'v 0.0072',
 'j 0.0025',
 "' 0.0022",
 'x 0.0012',
 'z 0.0008',
 'q 0.0004']

In [None]:
# Pick one model; the alternatives are left commented out for experiments.
#lm = Markov(order=5, counter=counter)
#lm = MarkovLaplas(beta = 1,  order=5, counter=counter)
#lm = MarkovInterpolated(beta = 0.5,  order=5, counter=counter)
lm = MarkovLaplasInterpolated(beta=0.5, gamma=1,  order=5, counter=counter)
# Probabilities of every known character after an arbitrary Cyrillic context
# (presumably unseen when the counter holds English text), and their sum.
probs = [ lm.prob('ывфыфыв'+w)    for w,_ in counter.branches()]
print(sum(probs))
print([f"{p:.2}" for p in probs])

In [None]:
# Perplexity of the current model on the full text.
print(lm.perplexity(text))

## Реальный текст

In [50]:
%%time
print(len(text_trn))

# Count n-grams up to order 12 on the training text.
counter = NGramsCounter(12)
counter.add(text_trn)

# Number of unique n-grams for each order, and their total.
u = counter.unique()
print(u, sum(u))

17246125
[28, 695, 7614, 42774, 164101, 508195, 1171616, 2157428, 3449396, 4937135, 6502294, 8070359] 27011635
Wall time: 4min 56s


In [51]:
%%time
print(len(text_tst))
# Test-set perplexity for each model order 1..12, all built on the same counter.
for i in range(1, 13):
    #lm = MarkovLaplas(beta = 1,  order=i, counter=counter)
    lm = MarkovInterpolated(beta=0.5, minN = 3, order=i, counter=counter)    
    print("%d %.2f" % (i,  lm.perplexity(text_tst)))   

4696973
1 17.07
2 10.79
3 7.12
4 5.02
5 3.96
6 3.44
7 3.16
8 3.02
9 2.97
10 2.97
11 3.01
12 3.06
Wall time: 30min 19s


In [None]:
# Order-9 model; generate 100 items continuing the given Russian prompt.
lm = MarkovInterpolated(beta=0.5, minN=1, order=9, counter=counter)    
"".join(lm.generate("три девицы под окном пряли поздно вечерко", 100))


In [None]:

# Probability of a specific string, followed by the raw counts for the bigram "жц".
print(lm.prob("а заволжц"))
N1, N2, _ = counter.counts("жц")
print(N1,N2)

In [None]:
# Order-1 model; generate 100 items starting from 'ма'.
lm = MarkovInterpolated(beta=0.1,  order=1, counter=counter)    
"".join( lm.generate('ма', 100) )

In [None]:
print(len(text_trn))
# Rebuild the counter for each maximum order 1..5 and report its unique n-gram counts.
# NOTE: `counter` is rebound on every iteration; the order-5 counter remains afterwards.
for i in range(1,6):
    counter = NGramsCounter(i)    
    counter.add(text_trn)
    u = counter.unique()
    print(i, u, sum(u))

## Энтропия слов

In [59]:
from nltk.lm import Vocabulary

# Split the training text into words and build a vocabulary with cutoff 10.
words_trn = text_trn.split()
print(len(words_trn))
vocab = Vocabulary(words_trn, unk_cutoff=10)
print(vocab)

# Replace out-of-vocabulary words by the '<UNK>' label.
words_trn = vocab.lookup(words_trn)    

3825715
<Vocabulary with cutoff=10 unk_label='<UNK>' and 10180 items>


In [62]:
print("words_trn:", len(words_trn))

# Count word n-grams up to order 7 on the training words.
counter = NGramsCounter(7)
counter.add(words_trn)

print("размер словаря:", len(counter.root))       
print("слова  словаря:", counter.branches()[:100])  
print(counter.ngrams[1:])                       # number of n-grams for each order
print(counter.unique() )                        # number of unique n-grams for each order

words_trn: 3825715
размер словаря: 10180             
слова  словаря: [('.', 384646), ('the', 163040), ('to', 137617), ('a', 117446), ('was', 93231), ('he', 90811), ('she', 75712), ('and', 67283), ('<UNK>', 62820), ('her', 56950), (',', 51501), ('his', 51038), ('it', 44984), ('in', 41745), ('i', 40686), ('of', 37634), ('for', 35041), ('had', 33010), ('on', 29896), ('they', 23806), ('at', 22233), ('with', 19446), ('one', 19008), ('my', 18136), ('up', 17745), ("'s", 17733), ('went', 17589), ('out', 17231), ('day', 16779), ('him', 16464), ('that', 16362), ('but', 15087), ('when', 14861), ('decided', 14489), ('got', 14445), ('wanted', 13986), ('were', 12970), ('all', 12693), ('an', 10538), ('so', 10466), ('new', 10214), ('as', 9735), ('very', 9561), ('them', 9184), ('would', 8549), ('from', 8546), ('not', 8331), ('get', 8150), ('we', 8073), ('there', 8023), ('some', 7818), ('go', 7768), ('took', 7694), ("didn't", 7519), ('friends', 7510), ('about', 7399), ('school', 7357), ('home', 7327), 

In [64]:
%%time
# Map the test words through the training vocabulary (rare/unknown -> '<UNK>').
words_tst = vocab.lookup(text_tst.split())    

print(len(words_tst))
# Test-set perplexity for each model order 1..7, all built on the same counter.
for i in range(1, 8):
    #lm = MarkovLaplas(beta = 1,  order=i, counter=counter)
    lm = MarkovInterpolated(beta=1, minN = 3, order=i, counter=counter)    
    print("%d %.2f" % (i,  lm.perplexity(words_tst)))   

1033901
1 479.75
2 156.73
3 117.26
4 112.24
5 112.22
6 112.30
7 112.32
Wall time: 1min 1s


In [72]:
print("слова  словаря:", counter.branches()[:10])
# Order-5 word model; generate 100 items starting from "the", joined with spaces.
lm = MarkovInterpolated(beta=1, minN = 3, order=5, counter=counter)    
lm.generate(["the"], 100, insert=" ")

слова  словаря: [('.', 384646), ('the', 163040), ('to', 137617), ('a', 117446), ('was', 93231), ('he', 90811), ('she', 75712), ('and', 67283), ('<UNK>', 62820), ('her', 56950)]


"<UNK> to war the for . when my . she day marissa . he warren pretty latest shirts light planned on comes was was possible . decided was . the cigarettes aline himself in work . as winter the however . looking she friends was would twins got anyone , also , the was was moved said . hannah to 's free always . and afraid money for was has has has would on seek were couldn't away on cheese then front shirt shirts was pictures was kind won as was years free the warren shake . my i got "

In [75]:
# Raw counts for the word "for" alone and for the pair "the", "for".
N1,N2, _ = counter.counts(["for"])
print(N1,N2)
N1,N2, _ = counter.counts(["the", "for"])
print(N1,N2)

35041 3825715
1 163040


# Предсказания 
## Для букв

In [None]:
# NOTE(review): Markov is used here as Markov(order, beta, backoff=...) with
# fit()/get_char(), unlike the Markov(order=..., counter=...) calls elsewhere in
# this notebook - presumably an older my_ngrams API; confirm before running.
print(len(text_trn))
for i in range(1, 10):
    markov = Markov(i, 0.5, backoff=False)
    markov.fit(text_trn)
    st = "три девицы под окном пряли поздно вечерко"    
    # Extend the prompt by 100 generated characters, one at a time.
    for k in range(100):
        st += markov.get_char(st)
    print(i, st)

## Для слов

In [None]:
# Word-level model on the whole text.
# NOTE(review): uses the positional Markov(order, beta) / fit() API, which differs
# from the counter-based Markov calls elsewhere in this notebook - confirm.
lst = text.split()
print(len(lst))

markov = Markov(4, 0.5)
markov.fit(lst)

In [None]:
#start = ["я", "не", "знаю", "этого"]

start = ["на", "глазах", "у", "многочисленных", "свидетелей"]

# Generate 100 words; each one is appended to the context used for the next.
for num in range(100):
    w = markov.get_next(start)
    print(w, end=' ')
    start.append(w)


#  Перплексия 
## Для букв

In [None]:
# Character-level evaluation on the test text for orders 1..12.
# NOTE(review): uses Markov(order, beta, backoff=...) with fit()/evaluate(), unlike
# the counter-based Markov calls elsewhere in this notebook - confirm the API.
for i in range(1, 13):
    markov = Markov(i, 0.9, backoff=True)
    markov.fit(text_trn)
    print("%d %.2f" % (i,markov.evaluate(text_tst)))   

## Для слов

In [None]:
# Word-level evaluation on the test text for orders 1..4.
# NOTE(review): uses Markov(order, beta) with fit()/evaluate(), unlike the
# counter-based Markov calls elsewhere in this notebook - confirm the API.
for i in range(1, 5):
    markov = Markov(i, 0.99)
    markov.fit(text_trn.split())
    print("%d %.2f" % (i,markov.evaluate(text_tst.split())))   

In [None]:
import torch

# Stack four (1, 2, 3) tensors along the first axis: a 0..5 ramp, ones, zeros, ones.
ramp = torch.arange(6).view(1, 2, 3).float()
parts = [ramp, torch.ones(1, 2, 3), torch.zeros(1, 2, 3), torch.ones(1, 2, 3)]
r = torch.cat(parts, dim=0)
print(r.shape)
print(r)