[PYTHON] Extracting text from the image of the assessment criteria for the novel coronavirus infection situation in Hyogo Prefecture

Screenshot_2020-12-31 兵庫県 緊急時用トップページ.png


import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

# Download the alert image shown on the Hyogo Prefecture top page and save
# it locally so that it can be processed by OCR afterwards.
url = "https://web.pref.hyogo.lg.jp/index.html"

# Browser-like User-Agent sent with both requests below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
# Fail early on an HTTP error instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First <img> that is a direct child of a <p> directly under div#tmp_contents.
tag = soup.select_one("div#tmp_contents > p > img")

# select_one() returns None when nothing matches (e.g. the page layout changed).
if tag is None or not tag.get("src"):
    raise RuntimeError(f"alert image not found on {url}")

# The src attribute may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("src"))

r = requests.get(link, headers=headers)
r.raise_for_status()

# NOTE: the file name ends with a space; the OCR step reads the image back
# under exactly the same name, so it is kept as-is here.
with open("alert.png ", mode="wb") as fw:
    fw.write(r.content)


Install tesseract-ocr

# Notebook shell cell: every line starting with "!" is run as a shell command.

# Register the alex-p/tesseract-ocr PPA (-y skips the confirmation prompt),
# refresh the package index, then install Tesseract and its development files.
!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt update
!apt install tesseract-ocr
!apt install libtesseract-dev
# Print the installed Tesseract version.
!tesseract -v

# Japanese data packages (horizontal and vertical variants, judging by the
# package names), then list the languages Tesseract can see.
!apt install tesseract-ocr-jpn  tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
!tesseract --list-langs
# Python wrapper used below to call Tesseract.
!pip install pytesseract

Extract text from images

import pytesseract

import cv2
import numpy as np

from google.colab.patches import cv2_imshow

# Load the saved image, binarize it so the majority (background) color becomes
# white, and run Japanese OCR on the result.

# NOTE: the file name ends with a space; it matches the name the image was
# saved under earlier in this file.
img_bgr = cv2.imread("alert.png ")

# cv2.imread() returns None instead of raising when the file cannot be read.
if img_bgr is None:
    raise FileNotFoundError("could not read image file 'alert.png '")

# There is a black border left on the edge, so crop 10 px off every side
img_bgr = img_bgr[10:-10, 10:-10]

img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)

# Color confirmation (shows the BGR value of one pixel when run in a notebook)
img_bgr[10, 10]

# Check the image
cv2_imshow(img_bgr)

# Color count: pixels at or below the threshold count as black, the rest white
black = np.count_nonzero(img_gray <= 150)
white = np.count_nonzero(img_gray > 150)

# Check whether white or black dominates; if there is more black, invert so
# that the dominant color ends up white in the binarized image.
if white < black:
    ret, thresh = cv2.threshold(img_gray, 150, 255, cv2.THRESH_BINARY_INV)
else:
    ret, thresh = cv2.threshold(img_gray, 150, 255, cv2.THRESH_BINARY)

# Check the image
cv2_imshow(thresh)

# "--psm 6" selects Tesseract's page segmentation mode 6.
txt = pytesseract.image_to_string(thresh, lang="jpn", config="--psm 6").strip()


