[Python] Qiita article information is pushed into mongoDB

What I want to do

I want to fetch Qiita article information with the Qiita API and check things such as which articles and which tags are viewed most often. As a first step, I will register the information obtained from the API in MongoDB.

Get Qiita article information

The code in this article is written in Python. For fetching the article information, I referred to the following article.

Try using Qiita API from Python

import requests
import logging
import json

# Log line layout: timestamp, logger name, level and message.
formatter = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
# The level is WARNING, so with this configuration the logger.info() calls
# in this module are not emitted unless the level is lowered
# (e.g. to logging.INFO).
logging.basicConfig(level=logging.WARNING, format=formatter)
logger = logging.getLogger(__name__)

class GetQiitaInfo(object):
    """Fetch the authenticated user's Qiita articles through the Qiita API v2."""

    def __init__(self, token='your token'):
        """Store the API access token.

        :param token: Qiita access token sent as a Bearer token. The default
            is a placeholder and must be replaced with a real token.
        """
        self.token = token

    def get_next_url(self, response):
        """Return the URL of the next page, or None when there is none.

        If there is a next page, its url is included in the ``link`` response
        header marked with ``rel="next"``, for example::

            link: <https://qiita.com/api/v2/authenticated_user/items?page=1>;
            rel="first", <https://qiita.com/api/v2/authenticated_user/items?page=2>;
            rel="next", <https://qiita.com/api/v2/authenticated_user/items?page=4>;
            rel="last"

        :param response: Response object exposing a ``headers`` mapping.
        :return: Next url, or None if the header is missing or has no
            ``rel="next"`` entry.
        """
        # .get() instead of [] so that a response without a link header
        # yields None rather than raising KeyError.
        link_header = response.headers.get('link')
        if not link_header:
            return None

        for part in link_header.split(','):
            if 'rel="next"' in part:
                return part[part.find('<') + 1:part.find('>')]
        return None

    def _get_all_pages(self, url, headers):
        """Follow pagination starting at url and return all JSON entries as one list."""
        results = []
        while url is not None:
            logger.info('GET %s', url)
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            results.extend(json.loads(response.text))
            # Check if there is a following url
            url = self.get_next_url(response)
        return results

    def get_items(self):
        """Get all articles of the authenticated user, following pagination.

        Since the number of views and the number of stocks are not usable
        from the list response, they are fetched per article and added to
        each item, together with ``tag1`` .. ``tag5`` and ``tag_list``.

        :return: List of articles (dicts), each extended with
            ``page_views_count``, ``tag1`` .. ``tag5``, ``tag_list`` and
            ``stocks_count``.
        :raises requests.HTTPError: If any API request returns an error status.
        """
        headers = {'Authorization': 'Bearer {}'.format(self.token)}
        items = self._get_all_pages(
            'https://qiita.com/api/v2/authenticated_user/items', headers)

        # Get and add view and stock information for each article.
        # page_views_count has a field in the list API but null is returned.
        for item in items:
            # Number of views
            url = 'https://qiita.com/api/v2/items/{}'.format(item['id'])
            logger.info('GET %s', url)
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            detail = json.loads(response.text)
            item['page_views_count'] = detail['page_views_count']

            tag_names = [tag['name'] for tag in detail['tags']]
            # tag1..tag5 are always present; slots without a tag are ''
            # (this also avoids an IndexError for an article without tags).
            for index in range(5):
                key = 'tag{}'.format(index + 1)
                item[key] = tag_names[index] if index < len(tag_names) else ''
            item['tag_list'] = tag_names

            # Stock quantity: the stockers endpoint is a list endpoint, so
            # every page is followed instead of counting only the first one.
            stockers = self._get_all_pages(
                'https://qiita.com/api/v2/items/{}/stockers'.format(item['id']),
                headers)
            for user in stockers:
                logger.info({
                    'id': user['id'],
                    'name': user['name']
                    })
            item['stocks_count'] = len(stockers)

        return items

For my own study, I made two changes to the code from the article I referred to: (1) I turned it into a class, and (2) I added the fields tag1 to tag5 and tag_list.

mongoDB operation class

I wrote an article about operating MongoDB from Python before, and the class below is unchanged from that article.

Reference: Operating mongodb with Python - Part 4: insert -

from pymongo import MongoClient

class MongoSample(object):
    """Thin wrapper around a single MongoDB collection."""

    def __init__(self, dbName, collectionName, client=None):
        """Select the database and collection to operate on.

        :param dbName: Name of the database.
        :param collectionName: Name of the collection inside that database.
        :param client: Optional, already constructed client object. When
            omitted, ``MongoClient()`` is created with pymongo's default
            connection settings.
        """
        self.client = MongoClient() if client is None else client
        self.db = self.client[dbName]  # Set DB name
        self.collection = self.db.get_collection(collectionName)

    def find_one(self, projection=None, filter=None, sort=None):
        """Forward to ``collection.find_one`` and return its result."""
        return self.collection.find_one(projection=projection, filter=filter, sort=sort)

    def find(self, projection=None, filter=None, sort=None):
        """Forward to ``collection.find`` and return its result."""
        return self.collection.find(projection=projection, filter=filter, sort=sort)

    def insert_one(self, document):
        """Forward ``document`` to ``collection.insert_one`` and return its result."""
        return self.collection.insert_one(document)

    def insert_many(self, documents):
        """Forward ``documents`` to ``collection.insert_many`` and return its result."""
        return self.collection.insert_many(documents)

Get article information and register it in mongoDB

from get_qiita_info import GetQiitaInfo
from mongo_sample import MongoSample

# Get Qiita article information
qiita = GetQiitaInfo()
items = qiita.get_items()

# arg1: DB name
# arg2: Collection name
mongo = MongoSample("db", "qiita")

# If you do not need to delete any key values, the whole list can be
# registered in one bulk call instead of the loop below:
#     mongo.insert_many(items)

for item in items:
    # rendered_body / body is unnecessary, so delete it.
    # A default is passed so that a missing key does not raise KeyError.
    item.pop("rendered_body", None)
    item.pop("body", None)
    # Register one by one
    mongo.insert_one(item)

# Read one document back to confirm that the registration worked
result = mongo.find_one()
print(result)

Let's take a look at mongoDB after executing the above code.

> db.qiita.findOne()
{
        "_id" : ObjectId("5e38ff43c92e7c532aeffb47"),
        "coediting" : false,
        "comments_count" : 0,
        "created_at" : "2020-02-04T13:37:44+09:00",
        "group" : null,
        "id" : "331ae2289a95f5a9b901",
        "likes_count" : 0,
        "private" : false,
        "reactions_count" : 0,
        "tags" : [
                {
                        "name" : "Python",
                        "versions" : [ ]
                },
                {
                        "name" : "Python3",
                        "versions" : [ ]
                }
        ],
        "title" : "[Python] No value for argument 'self' in unbound method call",
        "updated_at" : "2020-02-04T13:37:44+09:00",
        "url" : "https://qiita.com/bc_yuuuuuki/items/331ae2289a95f5a9b901",
        "user" : {
                "description" : "I'm learning blockchain / AI / Python / Golang / MongoDB, etc.\r\nThe content posted on this site is my own opinion and does not necessarily represent the position, strategy, or opinion of my organization / company.",
                "facebook_id" : "",
                "followees_count" : 0,
                "followers_count" : 2,
                "github_login_name" : null,
                "id" : "bc_yuuuuuki",
                "items_count" : 28,
                "linkedin_id" : "",
                "location" : "",
                "name" : "",
                "organization" : "",
                "permanent_id" : 476876,
                "profile_image_url" : "https://pbs.twimg.com/profile_images/1157834557783072768/ktpc9kGV_bigger.jpg",
                "team_only" : false,
                "twitter_screen_name" : "bc_yuuuuuki",
                "website_url" : ""
        },
        "page_views_count" : 54,
        "tag1" : "Python",
        "tag2" : "Python3",
        "tag_list" : [
                "Python",
                "Python3"
        ],
        "stocks_count" : 0
}

I was able to confirm that it was registered.

Impressions

In this code the API result is modified slightly before it is stored, but it is convenient that the JSON obtained from an API can be saved as-is and then searched and aggregated without having to design a schema first.

Recommended Posts

[Python] Qiita article information is pushed into mongoDB
[Python] Get user information and article information with Qiita API
About February 02, 2020 * This is a Python article.
What is python
Python is instance
Qiita, early Python ♪
What is Python