Créer un compteur de fréquence de mots avec Python 3.4

Ce que l'on veut faire

Vidéo # 1, [Vidéo # 2](https://www.youtube.com/watch?v=up5Xehmtn236 = PL6gx4Cwl9DGAcbMi1sH6oAMk4JHw91mC_) et Vidéo # 3. L'objectif est d'accéder au site indiqué et d'afficher les mots utilisés dans chaque lien de titre, par ordre décroissant de fréquence.

Vidéo # 1

import requests
from bs4 import BeautifulSoup
import operator


def start(url):
	"""Fetch *url* and print every lower-cased word found in its post titles.

	Every ``<a class="title text-semibold">`` element is treated as one
	post title. Its text is lower-cased and split on whitespace; each word
	is printed and appended to ``word_list``.

	Args:
		url: address of the page to download.
	"""
	word_list = []
	# A timeout keeps the script from hanging forever on a silent server.
	source_code = requests.get(url, timeout=10).text
	soup = BeautifulSoup(source_code, 'html.parser')
	for post_text in soup.find_all('a', {'class': 'title text-semibold'}):
		# .string is None when the link wraps nested tags, which made
		# .lower() fail; get_text() always returns a str.
		content = post_text.get_text()
		for each_word in content.lower().split():
			print(each_word)
			word_list.append(each_word)
		

# Only crawl when run as a script, so importing this file triggers no request.
if __name__ == "__main__":
	start("https://www.thenewboston.com/forum/")

1. Créez une liste word_list (on y ajoutera plus tard tous les mots découpés). 2. Accédez au site et enregistrez le HTML sous forme de texte dans source_code. 3. Analysez ce texte avec Beautiful Soup. 4. Ciblez les éléments voulus (balise a et classe CSS) et n'extrayez dans content que le texte qu'ils contiennent. 5. Mettez content en minuscules, découpez-le à chaque espace et placez le résultat dans words. 6. À l'aide d'une boucle, ajoutez chaque mot de words à word_list. Voilà l'idée.

Sortie

dictionary
print
order
permanent
display
of
content
rendering
problems
whenever
i
start
the
android
studio
two
beginner
python
courses?
vector
about
double
buffering
arduino
code
asterisk
before
a
pointer
can
you
provide
me
the
arduino
code
for
eye
blinking
sensor(ir
sensor)
for
accidental
prevention.
can't
import
images
in
android
studio
can't
install
intel
haxm
free
internet
javascript
interpreter
lambda
function
my
funny
litlte
program
navigation
drawer
activity
not
able
to
find
the
problem
need
help
org.apache.http.client.httpclient
deprecated
question
about
themes
someone
share
a
link
to
source
codes??
source
code
?
which
all
views
should
be
turned
on?
x86
emulation
error
error
when
trying
to
build
and
run.
computer
doesn't
support
virtualization.
web
development
using
html
java
game
about
getting
user
input
eclipse
doesn't
recognise
my
imports
other
ways
of
styling

Vidéo # 2

import requests
from bs4 import BeautifulSoup
import operator


def start(url):
	"""Collect the lower-cased words of every post title found on *url*.

	Every ``<a class="title text-semibold">`` element is treated as one
	post title. The collected words are passed to ``clean_up_list``.

	Args:
		url: address of the page to download.
	"""
	word_list = []
	# A timeout keeps the script from hanging forever on a silent server.
	source_code = requests.get(url, timeout=10).text
	soup = BeautifulSoup(source_code, 'html.parser')
	for post_text in soup.find_all('a', {'class': 'title text-semibold'}):
		# .string is None when the link wraps nested tags, which made
		# .lower() fail; get_text() always returns a str.
		content = post_text.get_text()
		word_list.extend(content.lower().split())
	clean_up_list(word_list)

def clean_up_list(word_list):
	"""Strip punctuation symbols from each word and drop words left empty.

	Args:
		word_list: iterable of words (strings) to clean.

	Returns:
		A new list holding the cleaned, non-empty words in their original
		order. Callers that ignore the return value are unaffected.
	"""
	symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
	# Build the deletion table once, instead of recreating the symbol string
	# and chaining one replace() per symbol for every single word.
	strip_symbols = str.maketrans("", "", symbols)
	clean_word_list = []
	for word in word_list:
		word = word.translate(strip_symbols)
		if word:  # a word made only of symbols becomes "" and is skipped
			clean_word_list.append(word)
	return clean_word_list

# Only crawl when run as a script, so importing this file triggers no request.
if __name__ == "__main__":
	start("https://www.thenewboston.com/forum/")

La fonction start récupère les mots et les place dans word_list ; cette fois, on crée une fonction qui nettoie les mots récupérés : elle supprime les symboles (tout ce qui n'est ni une lettre ni un chiffre) et écarte les mots qui deviennent vides.

Créez d'abord clean_word_list.
Parcourez avec une boucle for chaque mot (word) de word_list, la liste des mots récupérés par la fonction start.
Pour chaque mot, vérifiez s'il contient l'un des symboles de symbols ; si tel est le cas, remplacez ce symbole par une chaîne vide.
Si la longueur de word est supérieure à 0 (c'est-à-dire qu'il ne reste pas seulement une chaîne vide), ajoutez-le à clean_word_list.

Sortie

variables
in
enum
dictionary
print
order
permanent
display
of
content
rendering
problems
whenever
i
start
the
android
studio
two
beginner
python
courses
vector
about
double
buffering
arduino
code
asterisk
before
a
pointer
can
you
provide
me
the
arduino
code
for
eye
blinking
sensorir
sensor
for
accidental
prevention
cant
import
images
in
android
studio
cant
install
intel
haxm
free
internet
javascript
interpreter
lambda
function
my
funny
litlte
program
navigation
drawer
activity
not
able
to
find
the
problem
need
help
orgapachehttpclienthttpclient
deprecated
question
about
themes
someone
share
a
link
to
source
codes
source
code
which
all
views
should
be
turned
on
x86
emulation
error
error
when
trying
to
build
and
run
computer
doesnt
support
virtualization
web
development
using
html
java
game
about
getting
user
input
eclipse
doesnt
recognise
my
imports

Vidéo # 3

import requests
from bs4 import BeautifulSoup
import operator #allows you to work with build-in data types in python


def start(url):
	"""Collect the lower-cased words of every post title found on *url*.

	Every ``<a class="title text-semibold">`` element is treated as one
	post title. The collected words are passed to ``clean_up_list``.

	Args:
		url: address of the page to download.
	"""
	word_list = []
	# A timeout keeps the script from hanging forever on a silent server.
	source_code = requests.get(url, timeout=10).text
	soup = BeautifulSoup(source_code, 'html.parser')
	for post_text in soup.find_all('a', {'class': 'title text-semibold'}):
		# .string is None when the link wraps nested tags, which made
		# .lower() fail; get_text() always returns a str.
		content = post_text.get_text()
		word_list.extend(content.lower().split())
	clean_up_list(word_list)

def clean_up_list(word_list):
	"""Remove symbols from every word, print the survivors and count them.

	Words that consist only of symbols end up empty and are skipped. The
	cleaned words are printed one per line and then handed to
	``create_dictionary``.
	"""
	symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
	delete_symbols = str.maketrans("", "", symbols)
	clean_word_list = []
	for raw_word in word_list:
		cleaned = raw_word.translate(delete_symbols)
		if not cleaned:
			# nothing but symbols: not a real word
			continue
		print(cleaned)
		clean_word_list.append(cleaned)
	create_dictionary(clean_word_list)

def create_dictionary(clean_word_list):
	"""Count how often each word occurs and print the counts, rarest first.

	One ``word count`` line is printed per distinct word, sorted by
	ascending count.
	"""
	word_count = {}
	for word in clean_word_list:
		# A word seen for the first time starts from 0, then one is added.
		word_count[word] = word_count.get(word, 0) + 1
	# Each item is a (word, count) pair: itemgetter(1) sorts by count, while
	# itemgetter(0) would sort alphabetically by word instead.
	by_count = sorted(word_count.items(), key=operator.itemgetter(1))
	for key, value in by_count:
		print(key, value)

# Only crawl when run as a script, so importing this file triggers no request.
if __name__ == "__main__":
	start("https://www.thenewboston.com/forum/")

Créez une fonction create_dictionary qui enregistre chaque mot comme clé, avec sa fréquence d'utilisation comme valeur. Grâce au if, on ajoute 1 au compteur si le mot figure déjà dans le dictionnaire ; sinon, on crée une nouvelle entrée avec la valeur 1. Utilisez for key, value in sorted(word_count.items(), key=operator.itemgetter(1)) pour parcourir les mots du dictionnaire triés par valeur. Le tri est croissant par défaut (les mots les plus fréquents apparaissent en dernier) ; ajoutez reverse=True à sorted pour obtenir l'ordre décroissant.

Sortie

variables
in
enum
dictionary
print
order
permanent
display
of
content
rendering
problems
whenever
i
start
the
android
studio
two
beginner
python
courses
vector
about
double
buffering
arduino
code
asterisk
before
a
pointer
can
you
provide
me
the
arduino
code
for
eye
blinking
sensorir
sensor
for
accidental
prevention
cant
import
images
in
android
studio
cant
install
intel
haxm
free
internet
javascript
interpreter
lambda
function
my
funny
litlte
program
navigation
drawer
activity
not
able
to
find
the
problem
need
help
orgapachehttpclienthttpclient
deprecated
question
about
themes
someone
share
a
link
to
source
codes
source
code
which
all
views
should
be
turned
on
x86
emulation
error
error
when
trying
to
build
and
run
computer
doesnt
support
virtualization
web
development
using
html
java
game
about
getting
user
input
eclipse
doesnt
recognise
my
imports
courses 1
images 1
order 1
litlte 1
i 1
link 1
variables 1
input 1
when 1
someone 1
pointer 1
vector 1
x86 1
buffering 1
on 1
of 1
blinking 1
recognise 1
beginner 1
enum 1
javascript 1
should 1
need 1
eclipse 1
computer 1
dictionary 1
virtualization 1
navigation 1
can 1
permanent 1
provide 1
prevention 1
print 1
function 1
game 1
internet 1
html 1
question 1
rendering 1
deprecated 1
you 1
turned 1
orgapachehttpclienthttpclient 1
find 1
haxm 1
activity 1
asterisk 1
using 1
which 1
intel 1
double 1
all 1
support 1
problem 1
two 1
funny 1
whenever 1
display 1
problems 1
sensor 1
accidental 1
java 1
interpreter 1
me 1
eye 1
help 1
before 1
imports 1
getting 1
development 1
trying 1
import 1
not 1
drawer 1
install 1
codes 1
views 1
be 1
user 1
share 1
themes 1
web 1
content 1
able 1
program 1
build 1
sensorir 1
python 1
emulation 1
and 1
start 1
run 1
lambda 1
free 1
in 2
for 2
android 2
arduino 2
cant 2
error 2
doesnt 2
studio 2
a 2
my 2
source 2
the 3
to 3
code 3
about 3

Post-scriptum:

J'ai reçu de @lazykyama le conseil suivant : « Si vous utilisez Counter du module collections, vous n'aurez presque plus besoin de la fonction créée dans la vidéo 3. » J'ai donc décidé de l'implémenter immédiatement.

import requests
from bs4 import BeautifulSoup
import operator #allows you to work with build-in data types in python
from collections import Counter


def start(url):
	"""Collect the lower-cased words of every post title found on *url*.

	Every ``<a class="title text-semibold">`` element is treated as one
	post title. The collected words are passed to ``clean_up_list``.

	Args:
		url: address of the page to download.
	"""
	word_list = []
	# A timeout keeps the script from hanging forever on a silent server.
	source_code = requests.get(url, timeout=10).text
	soup = BeautifulSoup(source_code, 'html.parser')
	for post_text in soup.find_all('a', {'class': 'title text-semibold'}):
		# .string is None when the link wraps nested tags, which made
		# .lower() fail; get_text() always returns a str.
		content = post_text.get_text()
		word_list.extend(content.lower().split())
	clean_up_list(word_list)

def clean_up_list(word_list):
	"""Remove symbols from every word and print a Counter of what is left.

	Words that consist only of symbols end up empty and are not counted.
	"""
	symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
	delete_symbols = str.maketrans("", "", symbols)
	stripped_words = (raw_word.translate(delete_symbols) for raw_word in word_list)
	# keep only the words that still contain something after stripping
	clean_word_list = [cleaned for cleaned in stripped_words if cleaned]

	counts = Counter(clean_word_list)
	print(counts)

# Only crawl when run as a script, so importing this file triggers no request.
if __name__ == "__main__":
	start("https://www.thenewboston.com/forum/")

Voici la sortie:

Counter({'the': 9, 'to': 5, 'i': 5, 'with': 5, 'program': 3, 'image': 3, 'code': 3, 'web': 3, 'help': 3, 'simple': 3, 'source': 3, 'crawler': 3, 'a': 3, 'in': 3, 'am': 2, 'not': 2, 'error': 2, 'cant': 2, 'is': 2, 'my': 2, 'images': 2, 'when': 2, 'getting': 2, 'tutorial': 2, 'about': 2, 'for': 2, 'need': 2, 'app': 2, 'problem': 2, 'android': 2, 'find': 2, 'and': 2, 'studio': 1, 'running': 1, 'clock': 1, 'selenium': 1, 'codes': 1, 'mergesort': 1, 'it': 1, 'trouble': 1, 'someone': 1, 'please': 1, 'webpage': 1, 'method': 1, 'beginners': 1, 'camera': 1, 'lambda': 1, 'specified': 1, 'build': 1, 'buying': 1, 'development': 1, 'dosent': 1, 'run': 1, 'of': 1, 'anything': 1, 'mac': 1, 'reference': 1, 'mistake': 1, 'linked': 1, 'haxm': 1, 'list': 1, 'now': 1, 'trying': 1, 'on': 1, 'typecasting': 1, 'got': 1, 'current': 1, 'imagemap': 1, 'question': 1, 'undefined': 1, 'assignment': 1, 'population': 1, 'import': 1, 'able': 1, 'apple': 1, 'system': 1, 'needs': 1, 'show': 1, 'prepaid': 1, 'install': 1, 'how': 1, 'cannot': 1, 'hover': 1, 'add': 1, 'video': 1, '4': 1, 'default': 1, 'involving': 1, 'inserting': 1, 'you': 1, 'only': 1, 'function': 1, 'file': 1, 'themes': 1, 'this': 1, '28': 1, 'chooser': 1, 'refresh': 1, 'share': 1, 'link': 1, 'where': 1, 'tagif': 1, 'tip': 1, 'practice': 1, 'python': 1, 'get': 1, 'visa': 1, 'environment': 1, 'funny': 1, 'possible': 1, '42': 1, 'css': 1, 'step': 1, 'bitcoins': 1, 'time': 1, 'which': 1, 'variable': 1, 'date': 1, 'litlte': 1, 'as': 1, 'override': 1, 'capture': 1, 'effect': 1, 'intel': 1, 'can': 1, 'but': 1, 'at': 1, 'bug': 1, 'onattach': 1, 'loop': 1, 'what': 1})

C'est pratique : le code devient plus concis et le résultat est regroupé dans un objet de type dictionnaire.

De plus, il semble qu'il soit possible de spécifier un mot spécifique et d'afficher sa fréquence d'utilisation. Par exemple, si vous souhaitez afficher la fréquence du seul mot «the»:

import requests
from bs4 import BeautifulSoup
import operator #allows you to work with build-in data types in python
from collections import Counter


def start(url):
	"""Collect the lower-cased words of every post title found on *url*.

	Every ``<a class="title text-semibold">`` element is treated as one
	post title. The collected words are passed to ``clean_up_list``.

	Args:
		url: address of the page to download.
	"""
	word_list = []
	# A timeout keeps the script from hanging forever on a silent server.
	source_code = requests.get(url, timeout=10).text
	soup = BeautifulSoup(source_code, 'html.parser')
	for post_text in soup.find_all('a', {'class': 'title text-semibold'}):
		# .string is None when the link wraps nested tags, which made
		# .lower() fail; get_text() always returns a str.
		content = post_text.get_text()
		word_list.extend(content.lower().split())
	clean_up_list(word_list)

def clean_up_list(word_list):
	"""Remove symbols from every word and print how often "the" occurs.

	Words that consist only of symbols end up empty and are not counted.
	"""
	symbols = "!@#$%^&*()_+{}:\"<>?,./;'[]-="
	delete_symbols = str.maketrans("", "", symbols)
	clean_word_list = []
	for raw_word in word_list:
		cleaned = raw_word.translate(delete_symbols)
		if cleaned:
			clean_word_list.append(cleaned)

	# Indexing a Counter with a word yields its count (0 if never seen).
	specific = Counter(clean_word_list)["the"]
	print(specific)

# Only crawl when run as a script, so importing this file triggers no request.
if __name__ == "__main__":
	start("https://www.thenewboston.com/forum/")

Vous pouvez également modifier à volonté la fréquence d'un mot dans le Counter, comme dans un dictionnaire ordinaire, par exemple avec counts["the"] = 15. Avec counts["the"] = 0, le mot se retrouve à la fin du classement par fréquence. Il peut également être supprimé avec del counts["the"].

Vous pouvez également créer une liste avec x = list(counts.elements()).

# Same as above, so omitted
	counts = Counter(clean_word_list)
	# elements() yields each word repeated as many times as its count
	counts_list = list(counts.elements())
	print(counts_list)

# Only crawl when run as a script, so importing this file triggers no request.
if __name__ == "__main__":
	start("https://www.thenewboston.com/forum/")

['please', 'problem', 'problem', 'add', 'crawler', 'crawler', 'crawler', 'running', 'specified', 'is', 'is', 'dosent', 'practice', 'intel', 'anything', 'show', 'mergesort', 'image', 'image', 'image', 'list', 'import', 'tip', 'loop', 'am', 'am', 'getting', 'getting', 'population', 'get', 'buying', 'for', 'for', 'about', 'about', 'which', '4', 'on', 'prepaid', 'mistake', 'override', 'got', 'function', 'share', 'as', 'clock', 'reference', 'cannot', 'bitcoins', 'effect', 'code', 'code', 'code', 'assignment', 'you', 'can', 'images', 'images', 'haxm', 'find', 'find', 'install', 'with', 'with', 'with', 'with', 'with', 'trying', 'file', 'and', 'and', 'what', 'android', 'android', 'typecasting', 'source', 'source', 'source', 'beginners', 'someone', 'possible', 'cant', 'cant', 'how', 'method', 'app', 'app', 'i', 'i', 'i', 'i', 'i', 'system', 'where', 'webpage', 'involving', 'funny', 'current', 'it', 'linked', 'in', 'in', 'in', 'variable', 'web', 'web', 'web', 'hover', 'litlte', 'question', 'tagif', 'time', 'inserting', 'trouble', 'program', 'program', 'program', 'bug', '42', 'tutorial', 'tutorial', 'need', 'need', 'video', 'lambda', 'date', 'chooser', 'run', 'error', 'error', 'default', 'to', 'to', 'to', 'to', 'to', 'of', 'apple', 'link', 'when', 'when', 'capture', 'mac', 'css', 'step', 'refresh', 'not', 'not', 'imagemap', 'development', 'camera', 'but', 'simple', 'simple', 'simple', 'needs', 'help', 'help', 'help', 'studio', 'a', 'a', 'a', '28', 'selenium', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'now', 'themes', 'environment', 'python', 'visa', 'only', 'this', 'able', 'undefined', 'onattach', 'build', 'at', 'my', 'my', 'codes']

most_frequent = counts.most_common(2) renvoie les deux mots les plus fréquemment utilisés (avec leur nombre d'occurrences) ; print(most_frequent[1]) affiche ensuite le deuxième mot le plus fréquemment utilisé.