In [1]:
from glob import glob
In [2]:
# https://github.com/borh/aozora-corpus-generatorを用いて作成した
filenames = [filename for filename in glob('/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/*.txt')]
In [3]:
len(filenames)
Out[3]:
5138
In [4]:
filenames[0:10]
Out[4]:
['/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Miyamoto_Y__Doukan.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Yoshida_K_Makenaishi.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Sugawa_K_Mujintou.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Satomura_K_Ku_Ri_Ga.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Matsumura_M_Ryoushi.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Mori_O_Kanzanji.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Kunieda_S_Futarimachi.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Terada_T_Natsu.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Toyoshima_Y_Doutei.txt',
 '/data/Japanese-Aozora-Bunko-2018-01-18-CWJ-orth/Tokenized/Hayashi_F_Kaneia.txt']
In [5]:
def corpus_to_tokens(file_list):
    '''コーパスから文ごとの形態素のリストを返す。'''
    for filename in file_list:
        with open(filename) as f:
            text = f.read()
            tokens = text.splitlines()
            
            sentence = []
            for token in tokens:
                if token == '<PGB>':
                    pass
                elif token == '<EOS>':
                    yield sentence
                    sentence = []
                else:
                    sentence.append(token)
In [6]:
sentences = [s for s in corpus_to_tokens(filenames)]
In [7]:
len([token for sentence in sentences for token in sentence])
Out[7]:
71281640
In [8]:
from gensim.models.word2vec import Word2Vec
In [9]:
model = Word2Vec(sentences, size=100, window=8, min_count=5, workers=10)
In [10]:
model.wv.most_similar('家')
Out[10]:
[('借家', 0.7052583694458008),
 ('生家', 0.6800761222839355),
 ('邸', 0.665619969367981),
 ('実家', 0.6504373550415039),
 ('別荘', 0.6474984884262085),
 ('長屋', 0.64559006690979),
 ('屋敷', 0.6399312615394592),
 ('一家', 0.638443112373352),
 ('住居', 0.6369857788085938),
 ('アパート', 0.6230219602584839)]
In [11]:
model.wv.doesnt_match(['学校', '大学', '下宿', '野菜'])
Out[11]:
'野菜'
In [12]:
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
In [13]:
def corpus_to_bow(file_list, window=2000):
    '''コーパスから文章ごとの形態素のリストを返す。windowを指定することで、その数で全語を分割して返す。'''
    for filename in file_list:
        with open(filename) as f:
            text = f.read()
            tokens = [token for token in text.splitlines()
                      if token != '<PGB>' and token != '<EOS>']
            if not window:
                yield tokens
            else:
                for i in range(len(tokens) // window):
                    start = i*window
                    stop = start + window if start + window <= len(tokens) else len(tokens)
                    yield tokens[start:stop]
In [14]:
len(list(map(len, corpus_to_bow(filenames))))
Out[14]:
33306
In [ ]:
corpus = list(corpus_to_bow(filenames))

dic = Dictionary(corpus)
dic.filter_extremes(no_below=5, no_above=0.4, keep_n=None)
dic.compactify()

corpus_gensim = [dic.doc2bow(doc) for doc in corpus]
In [16]:
lda_model = LdaModel(corpus_gensim, num_topics=50, iterations=200, alpha='auto', passes=15)
In [17]:
lda_model.show_topics();
In [18]:
import pyLDAvis.gensim
In [19]:
v = pyLDAvis.gensim.prepare(lda_model, corpus_gensim, dic)
/opt/conda/lib/python3.6/site-packages/pyLDAvis/_prepare.py:387: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
In [20]:
pyLDAvis.display(v)
Out[20]:
In [21]:
dic[1]
Out[21]:
'示唆'
In [22]:
import pandas as pd
top_topics = lda_model.show_topics(num_topics=50, num_words=5, formatted=False)
top_topics = [[topic_id, ','.join([dic[morpheme] if morpheme in dic else 'OOV'
                                   for morpheme, prob in dist])]
              for topic_id, dist in top_topics]
df = pd.DataFrame(top_topics, columns=['Topic', 'Top words'])
pd.set_option('max_colwidth', 20)
print(df.to_latex(index=False))
\begin{tabular}{rl}
\toprule
 Topic &            Top words \\
\midrule
     0 &  OOV,OOV,OOV,OOV,OOV \\
     1 &  OOV,OOV,OOV,OOV,OOV \\
     2 &  OOV,OOV,OOV,OOV,OOV \\
     3 &  OOV,OOV,OOV,OOV,OOV \\
     4 &  OOV,OOV,OOV,OOV,OOV \\
     5 &  OOV,OOV,OOV,OOV,OOV \\
     6 &  OOV,OOV,OOV,OOV,OOV \\
     7 &  OOV,OOV,OOV,OOV,OOV \\
     8 &  OOV,OOV,OOV,OOV,OOV \\
     9 &  OOV,OOV,OOV,OOV,OOV \\
    10 &  OOV,OOV,OOV,OOV,OOV \\
    11 &  OOV,OOV,OOV,OOV,OOV \\
    12 &  OOV,OOV,OOV,OOV,OOV \\
    13 &  OOV,OOV,OOV,OOV,OOV \\
    14 &  OOV,OOV,OOV,OOV,OOV \\
    15 &  OOV,OOV,OOV,OOV,OOV \\
    16 &  OOV,OOV,OOV,OOV,OOV \\
    17 &  OOV,OOV,OOV,OOV,OOV \\
    18 &  OOV,OOV,OOV,OOV,OOV \\
    19 &  OOV,OOV,OOV,OOV,OOV \\
    20 &  OOV,OOV,OOV,OOV,OOV \\
    21 &  OOV,OOV,OOV,OOV,OOV \\
    22 &  OOV,OOV,OOV,OOV,OOV \\
    23 &  OOV,OOV,OOV,OOV,OOV \\
    24 &  OOV,OOV,OOV,OOV,OOV \\
    25 &  OOV,OOV,OOV,OOV,OOV \\
    26 &  OOV,OOV,OOV,OOV,OOV \\
    27 &  OOV,OOV,OOV,OOV,OOV \\
    28 &  OOV,OOV,OOV,OOV,OOV \\
    29 &  OOV,OOV,OOV,OOV,OOV \\
    30 &  OOV,OOV,OOV,OOV,OOV \\
    31 &  OOV,OOV,OOV,OOV,OOV \\
    32 &  OOV,OOV,OOV,OOV,OOV \\
    33 &  OOV,OOV,OOV,OOV,OOV \\
    34 &  OOV,OOV,OOV,OOV,OOV \\
    35 &  OOV,OOV,OOV,OOV,OOV \\
    36 &  OOV,OOV,OOV,OOV,OOV \\
    37 &  OOV,OOV,OOV,OOV,OOV \\
    38 &  OOV,OOV,OOV,OOV,OOV \\
    39 &  OOV,OOV,OOV,OOV,OOV \\
    40 &  OOV,OOV,OOV,OOV,OOV \\
    41 &  OOV,OOV,OOV,OOV,OOV \\
    42 &  OOV,OOV,OOV,OOV,OOV \\
    43 &  OOV,OOV,OOV,OOV,OOV \\
    44 &  OOV,OOV,OOV,OOV,OOV \\
    45 &  OOV,OOV,OOV,OOV,OOV \\
    46 &  OOV,OOV,OOV,OOV,OOV \\
    47 &  OOV,OOV,OOV,OOV,OOV \\
    48 &  OOV,OOV,OOV,OOV,OOV \\
    49 &  OOV,OOV,OOV,OOV,OOV \\
\bottomrule
\end{tabular}