Ce script convertit au format CSV le résultat (XML) d'une recherche par tag de vidéos Nico Nico. De plus, les tags de toutes les vidéos sont rassemblés : chaque tag devient une colonne, ajoutée à chaque ligne avec la valeur 1 si la vidéo porte ce tag et 0 sinon. Concrètement, le format de sortie est le suivant.
video_id,user_id,...,tag 1,tag 2,...
sm00000001,111111111,...,1,1,...
sm00000002,222222222,...,0,0,...
sm00000003,333333333,...,0,1,...
ncxml2csv.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ncxml2csv.py
# Copyright (c) 2014 nezuq
# This software is released under the MIT License.
# http://opensource.org/licenses/mit-license.php
from __future__ import unicode_literals
import sys
import codecs
from lxml import etree
import copy
# Command line: ncxml2csv.py [INPUT.xml] [MIN_COUNT] [MAX_COUNT] [DISP_SRCCOL]
# Every argument is optional and falls back to the default set below.
argvs = sys.argv
argc = len(argvs)

# Input file
FILE_INPUT = 'INPUT.xml'
if 1 < argc:
    FILE_INPUT = argvs[1]
    # Python 2 delivers argv as byte strings, which are decoded as UTF-8.
    # Python 3 already delivers text, which has no .decode() method.
    if isinstance(FILE_INPUT, bytes):
        FILE_INPUT = FILE_INPUT.decode('UTF-8')

# Threshold: tag columns whose number of occurrences lies outside
# [MIN_COUNT, MAX_COUNT] are not output.
MIN_COUNT = 3
if 2 < argc:
    MIN_COUNT = int(argvs[2])
MAX_COUNT = 9999
if 3 < argc:
    MAX_COUNT = int(argvs[3])

# Source-data output flag: the original (non-tag) columns are emitted only
# when this equals 1.
DISP_SRCCOL = 1
if 4 < argc:
    DISP_SRCCOL = int(argvs[4])
# Column names for the source-data part of the CSV header, in output order.
COLUMNS_NAME = [
    'video_id',
    'user_id',
    'deleted',
    'title',
    'description',
    'length_in_seconds',
    'length',
    'size_high',
    'size_low',
    'movie_type',
    'thumbnail_url',
    'upload_time',
    'first_retrieve',
    'default_thread',
    'view_counter',
    'comment_num',
    'mylist_counter',
    'last_res_body',
    'watch_url',
    'thumb_type',
    'embeddable',
    'no_live_play',
    'option_flag_ichiba',
    'option_flag_community',
    'option_flag_domestic',
    'option_flag_comment_type',
    'option_flag_adult',
    'option_flag_mobile',
    'option_flag_economy_mp4',
    'option_flag_middle_video',
    'option_flag_mobile_ng_apple',
    'main_category',
    'main_category_key',
    'thread_id',
    'thread_public',
    'thread_num_res',
    'thread_community_id',
    'tags',
]
# Element paths, relative to one <video_info>, that feed the source columns
# in COLUMNS_NAME order (all but the final 'tags' column).  None marks a
# column this converter always leaves empty.
_FIELD_PATHS = (
    'video/id',                           # video_id
    'video/user_id',                      # user_id
    'video/deleted',                      # deleted
    'video/title',                        # title
    'video/description',                  # description
    'video/length_in_seconds',            # length_in_seconds
    None,                                 # length
    None,                                 # size_high
    'video/size_low',                     # size_low
    'video/movie_type',                   # movie_type
    'video/thumbnail_url',                # thumbnail_url
    'video/upload_time',                  # upload_time
    'video/first_retrieve',               # first_retrieve
    'video/default_thread',               # default_thread
    'video/view_counter',                 # view_counter
    None,                                 # comment_num
    'video/mylist_counter',               # mylist_counter
    None,                                 # last_res_body
    None,                                 # watch_url
    None,                                 # thumb_type
    None,                                 # embeddable
    None,                                 # no_live_play
    'video/option_flag_ichiba',           # option_flag_ichiba
    'video/option_flag_community',        # option_flag_community
    'video/option_flag_domestic',         # option_flag_domestic
    'video/option_flag_comment_type',     # option_flag_comment_type
    'video/option_flag_adult',            # option_flag_adult
    'video/option_flag_mobile',           # option_flag_mobile
    'video/option_flag_economy_mp4',      # option_flag_economy_mp4
    'video/option_flag_middle_video',     # option_flag_middle_video
    'video/option_flag_mobile_ng_apple',  # option_flag_mobile_ng_apple
    'video/main_category',                # main_category
    'video/main_category_key',            # main_category_key
    'thread/id',                          # thread_id
    'thread/public',                      # thread_public
    'thread/num_res',                     # thread_num_res
    'thread/community_id',                # thread_community_id
)

# Text type of the running interpreter: unicode on Python 2, str on Python 3.
_TEXT = type(u'')


def _csv_cell(value):
    """Return *value* as a cell that cannot split a comma-joined CSV line.

    None and empty values become ''.  ASCII commas are swapped for the
    full-width comma U+FF0C.
    """
    # NOTE(review): the previous code called replace(',', ','), a no-op;
    # presumably the replacement was meant to be the full-width comma and
    # was lost in transcription -- confirm against the upstream script.
    if not value:
        return u''
    return value.replace(u',', u'\uff0c')


def _video_row(vi):
    """Build the source-column cells for one <video_info> element.

    Missing elements yield empty cells instead of raising.
    """
    cells = [
        vi.findtext(path) if path is not None else None
        for path in _FIELD_PATHS
    ]
    tags_el = vi.find('tags')
    if tags_el is None:
        cells.append(None)
    else:
        # tostring() emits ASCII bytes by default (non-ASCII characters as
        # character references), so decoding as ASCII is lossless and gives
        # text on both Python 2 and Python 3.
        cells.append(etree.tostring(tags_el).decode('ascii'))
    return [_csv_cell(cell) for cell in cells]


def main():
    """Convert the XML file FILE_INPUT to CSV written to sys.stdout.

    Each ``./video_info`` element becomes one CSV line.  When DISP_SRCCOL
    equals 1 the line starts with the COLUMNS_NAME source columns.  It is
    followed by one 0/1 column per tag, set to 1 when the video carries
    that tag.  Only tags carried by between MIN_COUNT and MAX_COUNT videos
    (inclusive) get a column.  Tag columns are ordered by descending video
    count; ties keep the order in which the tags first appear in the input.
    """
    rows = []        # source-column cells, one list per video
    row_tags = []    # set of tag names, one per video (parallel to rows)
    tag_counts = {}  # tag name -> number of videos carrying it
    tag_order = []   # tag names in first-seen order

    tree = etree.parse(FILE_INPUT)
    for vi in tree.findall('./video_info'):
        rows.append(_video_row(vi))
        names = set()
        for tag_el in vi.findall('tags/tag_info/tag'):
            name = tag_el.text
            if name in names:
                # A tag repeated within one video counts once.
                continue
            names.add(name)
            if name not in tag_counts:
                tag_counts[name] = 0
                tag_order.append(name)
            tag_counts[name] += 1
        row_tags.append(names)

    selected = [
        name for name in tag_order
        if MIN_COUNT <= tag_counts[name] <= MAX_COUNT
    ]
    # list.sort is stable, so equal counts keep first-seen order.
    selected.sort(key=tag_counts.get, reverse=True)

    show_source = DISP_SRCCOL == 1
    write = sys.stdout.write

    header = list(COLUMNS_NAME) if show_source else []
    header.extend(_csv_cell(_TEXT(name)) for name in selected)
    write(u','.join(header) + u'\n')

    for cells, names in zip(rows, row_tags):
        line = list(cells) if show_source else []
        line.extend(u'1' if name in names else u'0' for name in selected)
        write(u','.join(line) + u'\n')
if __name__ == '__main__':
    # Force UTF-8 output whatever the terminal or pipe encoding is.
    # On Python 3 the encoder has to wrap the underlying byte stream
    # (sys.stdout.buffer); wrapping the text stream fails on the first
    # write.  Python 2's sys.stdout has no .buffer and accepts bytes itself.
    sys.stdout = codecs.getwriter('utf_8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    main()
Exécuter ncxml2csv.py
python ncxml2csv.py INPUT.xml 3 9999 1 > OUTPUT.csv
-> Comparaison de l'API i.nicovideo.jp et de l'API getthumbinfo
Recommended Posts