I scraped from some URLs in Ruby and extracted topics using LDA in Python.
gem install
$ bundle init
$ vim Gemfile
gem 'mechanize'
$ bundle install
It is OK if it works with the following sample file.
sample.rb
require 'mechanize'

# Fetch the page at the given URL and print the text of every
# paragraph found under <body>.
crawler = Mechanize.new
page = crawler.get('Appropriate URL')
page.search('body p').each { |node| p node.text }
$ brew search mecab
mecab mecab-ipadic
$ brew install mecab mecab-ipadic
$ mecab
It is OK if mecab starts up interactively.
natto is a gem that wraps mecab installed on your system.
gem install
$ bundle init
$ vim Gemfile
gem 'natto'
$ bundle install
In order to use natto, you need to specify an environment variable called MECAB_PATH.
$ find /usr/ -name "*mecab*" | grep dylib 
$ export MECAB_PATH=/usr/local/Cellar/mecab/0.996/lib/libmecab.dylib
http://yatta47.hateblo.jp/entry/2015/12/13/150525 https://github.com/buruzaemon/natto
It is OK if it works with the following sample file.
sample.rb
require 'natto'

# Morphologically analyse a sample sentence with MeCab (via natto)
# and print each token's surface form and feature string.
text = 'Of the thighs and thighs'
analyser = Natto::MeCab.new
analyser.parse(text) do |token|
  puts "#{token.surface}\t#{token.feature}"
end
http://qiita.com/shizuma/items/d04facaa732f606f00ff http://d.hatena.ne.jp/otn/20090509
It should be made originally, but omitted this time.
This time, instead, we keep only general nouns, excluding pronouns and non-independent (bound) nouns.
# Keep a word only when its MeCab feature list marks it as a noun of
# the general subtype, and it is neither a pronoun nor a
# non-independent (bound) noun.
# NOTE(review): `features` is assumed to be word.feature split on
# commas — see the full class later in this article.
cond1 = features.include?('noun')
cond2 = features.include?('General')
cond3 = !features.include?('Pronoun')
cond4 = !features.include?('Non-independent')
if cond1 && cond2 && cond3 && cond4
  # Required processing goes here
end
Data is exchanged between python and ruby using json. Specifically, prepare a csv that summarizes the URL of the target page as shown below, scrape it from there, and convert it to the data structure required for LDA.
| url | 
|---|
| URL1 | 
| URL2 | 
| ... | 
| URLN | 
Finally, the following array with words arranged for each document is generated and output as json.
[
  ['human', 'interface', 'computer'],
  ['survey', 'user', 'computer', 'system', 'response', 'time'],
  ['eps', 'user', 'interface', 'system'],
  ['system', 'human', 'system', 'eps'],
  ['user', 'response', 'time'],
  ['trees'],
  ['graph', 'trees'],
  ['graph', 'minors', 'trees'],
  ['graph', 'minors', 'survey']
]
http://tohka383.hatenablog.jp/entry/20111205/1323071336 http://peaceandhilightandpython.hatenablog.com/entry/2013/12/06/082106
gem 'mechanize'
gem 'natto'
#A class that generates an array of URLs from csv
# Reads a CSV file whose first column lists URLs (with a header row)
# and exposes those URLs as an array.
class UrlGetService
  require 'csv'

  # csv_path - path to the CSV file to read.
  def initialize(csv_path)
    @csv_path = csv_path
  end

  # Returns the memoized array of URLs: the first column of every
  # row except the header.
  def web_urls
    @web_urls ||= csv_file.drop(1).map { |row| row[0] }
  end

  private

    attr_reader :csv_path

    # Parsed CSV rows (array of arrays), memoized.
    def csv_file
      @csv_file ||= CSV.parse(File.read(csv_path))
    end
end
#A class that scrapes a given URL
class WebScrapingService
  require 'mechanize'
  def initialize(url)
    @url = url
  end
  def texts
    @texts ||= -> do
      texts = ''
      page_contents.each do |content|
        texts += content.text
      end
      texts
    end.call
  end
  private
    attr_reader :url
    def page_contents
      @page_contents ||= scraping_agent.get(url).search('body p')
    end
    def scraping_agent
      @scraping_agent ||= Mechanize.new
    end
end
#A class that morphologically parses scraping results and creates an array of words
class MorphologicalAnalysisService
  require 'natto'
  `export MECAB_PATH=/usr//local/Cellar/mecab/0.996/lib/libmecab.dylib`
  def initialize(texts)
    @texts = texts
  end
  def words
    words = []
    morphological_analysis_agent.parse(texts) do |word|
      features = word.feature.split(/,/)
      cond1 = features.include?('noun')
      cond2 = features.include?('General')
      cond3 = !features.include?('Pronoun')
      cond4 = !features.include?('Non-independent')
      if cond1 && cond2 && cond3 && cond4
        words << word.surface
      end
    end
    words
  end
  private
    attr_reader :texts
    def morphological_analysis_agent
      @morphological_analysis_agent ||= Natto::MeCab.new
    end
end
#Class that dumps JSON using 3 classes
class DictionaryOutputService
  require 'json'
  def initialize(csv_path)
    @csv_path = csv_path
  end
  def output_json
    open('sample.json', 'w') do |f|
      JSON.dump(words_array, f)
    end
  end
  private
    attr_reader :csv_path
    def words_array
      @words_array ||= -> do
        web_urls.each_with_object([]) do |url, arr|
          texts = WebScrapingService.new(url).texts
          words = MorphologicalAnalysisService.new(texts).words
          white_lists =  words.inject(Hash.new(0)) { |h, a| h[a] += 1; h }.select { |_, c| c > 1 }.map { |w, _| w }
          arr << words.select { |w| white_lists.include?(w) }
        end
      end.call
    end
    def web_urls
      UrlGetService.new(csv_path).web_urls
    end
end
#Execute as follows
csv_path = "YOUR_CSV_PATH/file_name.csv"
DictionaryOutputService.new(csv_path).output_json
Instead of using the system python as it is, use the installed and versioned python.
git clone https://github.com/yyuu/pyenv.git ~/.pyenv
~/.bashrc
export PYENV_ROOT=$HOME/.pyenv
export PATH=$PYENV_ROOT/bin:$PATH
eval "$(pyenv init -)"
With a Python 3.5 series interpreter, installing gensim will not fail.
source ~/.bashrc
pyenv install 3.5.0
pyenv shell 3.5.0
http://qiita.com/Kodaira_/items/feadfef9add468e3a85b
To do LDA with python, use a module called gensim. setuptools required for gensim installation
sudo easy_install -U setuptools
Install gensim. Also update dependent tools such as numpy.
sudo -H pip install gensim -U
lda.py
from gensim import models, corpora

if __name__ == '__main__':
    # Originally this would be loaded from the JSON file produced by
    # the Ruby scraper; hard-coded here for illustration.
    texts = [['human', 'interface', 'computer'],
             ['survey', 'user', 'computer', 'system', 'response', 'time'],
             ['eps', 'user', 'interface', 'system'],
             ['system', 'human', 'system', 'eps'],
             ['user', 'response', 'time'],
             ['trees'],
             ['graph', 'trees'],
             ['graph', 'minors', 'trees'],
             ['graph', 'minors', 'survey']]
    # Map each unique word to an integer id.
    dictionary = corpora.Dictionary(texts)
    # Convert each document to a bag-of-words: a list of (word_id, count).
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Train the LDA model; num_topics=20 is arbitrary for this toy data.
    lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=20, id2word=dictionary)
    # Topics (-1 requests all topics).
    for topic in lda.show_topics(-1):
        print('topic')
        print(topic)
    # Topic distribution of each document.
    for topics_per_document in lda[corpus]:
        # Fixed typo in the output string ('ecah' -> 'each') and the
        # over-indented loop body.
        print('topic of each document')
        print(topics_per_document)
https://radimrehurek.com/gensim/tut1.html#corpus-formats https://openbook4.me/projects/193/sections/1154 http://sucrose.hatenablog.com/entry/2013/10/29/001041
#Required Package Army
# Required packages
install.packages("lda")
install.packages("ggplot2")
install.packages("reshape2")
# BUG FIX: install.packages() only installs; the package must also be
# loaded before data(cora.*) and lda.collapsed.gibbs.sampler() exist.
library(lda)
# Sample data bundled with the lda package (Cora citation dataset)
data(cora.documents)
data(cora.vocab)
## Number of topics
K <- 10
# Run the collapsed Gibbs sampler for LDA
result <- lda.collapsed.gibbs.sampler(cora.documents,
                                      K, # number of topics
                                      cora.vocab,
                                      25, # number of sampling iterations
                                      0.1, # hyperparameter alpha
                                      0.1, # hyperparameter beta
                                      compute.log.likelihood=TRUE)
# Top 5 highest-scoring words per topic
top.words <- top.topic.words(result$topics, 5, by.score=TRUE)