Convert the Go To Eat Hokkaido Campaign dealer-list PDF to CSV
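With tabula, pdfbox, and poppler, the seven kanji 越, 半, 熊, 煮, 新, 鰯, and 鶴 ("koshi", "han", "bear", "boiled", "new", "sardine", "crane") disappear from the extracted text.
For example, in the Northern Hokkaido list a search for "Kumakko Ramen Higashi-Asahikawa" finds nothing, because the leading 熊 ("bear") is missing; searching for the same name without that first character does find the entry.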
camelot and pdfplumber, which are built on pdfminer, export those characters as placeholders of the form "(cid:1279)", so you can look up each one in the displayed PDF and replace the placeholder with the correct character.
The PDF was created with "CubePDF", and its font is Yu Gothic.
import unicodedata
from urllib.parse import urljoin

import camelot
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Page that is scraped for the per-area PDF links.
url = "https://gotoeat-hokkaido.jp/general/particStores/"

# Send a browser-like User-Agent instead of the default python-requests one.
# NOTE(review): presumably the site rejects or alters responses for the
# default User-Agent -- confirm before removing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# requests has no default timeout; without one an unresponsive server would
# make the script hang forever.
r = requests.get(url, headers=headers, timeout=30)

# Stop immediately on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")
# One DataFrame per table extracted from the linked PDFs.
dfs = []

# Every anchor matched by the selector is treated as a link to one PDF; the
# anchor text is kept as the "area" label for the rows read from that PDF.
for anchor in soup.select("ul.cf > li > a"):
    # The href may be relative, so resolve it against the listing page URL.
    link = urljoin(url, anchor.get("href"))
    area = anchor.get_text(strip=True)

    # split_text / strip_text are passed through to camelot unchanged;
    # pages="all" reads every page of the PDF.
    tables = camelot.read_pdf(link, split_text=True, pages="all", strip_text="\n")

    for table in tables:
        # The first extracted row is used as the header, the rest as data.
        df_tmp = pd.DataFrame(table.data[1:], columns=table.data[0])

        # Drop all whitespace inside the header names.
        df_tmp.columns = df_tmp.columns.map(lambda s: "".join(s.split()))

        df_tmp["area"] = area
        dfs.append(df_tmp)
# Stack the per-table frames into one; cells for columns that a given table
# lacks come out as NaN and are blanked below.
df = pd.concat(dfs)

# The extraction leaves "(cid:NNNN)" placeholders where a glyph could not be
# decoded. Each placeholder is mapped back to the kanji it stands for.
# The values are written as escapes (the character is shown in the comment)
# so the literals survive re-encoding or machine translation of this file.
cid_to_kanji = {
    "(cid:1279)": "\u8d8a",  # 越
    "(cid:1535)": "\u534a",  # 半
    "(cid:1791)": "\u718a",  # 熊
    "(cid:2303)": "\u716e",  # 煮
    "(cid:2559)": "\u65b0",  # 新
    "(cid:2815)": "\u9c2f",  # 鰯
    "(cid:3071)": "\u9db4",  # 鶴
}


def _restore_cid_glyphs(text):
    """Return *text* with every known "(cid:NNNN)" placeholder replaced."""
    for placeholder, kanji in cid_to_kanji.items():
        text = text.replace(placeholder, kanji)
    return text


# Series.map per column is the element-wise equivalent of
# DataFrame.applymap, which is deprecated since pandas 2.1.
df = df.fillna("").apply(lambda column: column.map(_restore_cid_glyphs))
# Translation table that replaces CJK Radicals Supplement and Kangxi Radicals
# code points with the ordinary ideographs they look like.
# NOTE(review): presumably the PDF text layer yields radical code points for
# some kanji, which breaks searching and sorting -- confirm against the PDFs.
#
# Radicals and ideographs are visually identical, so everything is written
# as code points / escapes; the comments show the intended characters.

# CJK Radicals Supplement: radical code point -> ideograph.
_SUPPLEMENT_RADICALS = {
    0x2E83: "\u4e5a",  # ⺃ -> 乚
    0x2E85: "\u4ebb",  # ⺅ -> 亻
    0x2E89: "\u5202",  # ⺉ -> 刂
    0x2E8B: "\u353e",  # ⺋ -> 㔾
    0x2E8E: "\u5140",  # ⺎ -> 兀
    0x2E8F: "\u5c23",  # ⺏ -> 尣
    0x2E90: "\u5c22",  # ⺐ -> 尢
    0x2E92: "\u5df3",  # ⺒ -> 巳
    0x2E93: "\u5e7a",  # ⺓ -> 幺
    0x2E94: "\u5f51",  # ⺔ -> 彑
    0x2E96: "\u5fc4",  # ⺖ -> 忄
    0x2E98: "\u624c",  # ⺘ -> 扌
    0x2E99: "\u6535",  # ⺙ -> 攵
    0x2E9B: "\u65e1",  # ⺛ -> 旡
    0x2E9F: "\u6bcd",  # ⺟ -> 母
    0x2EA0: "\u6c11",  # ⺠ -> 民
    0x2EA1: "\u6c35",  # ⺡ -> 氵
    0x2EA2: "\u6c3a",  # ⺢ -> 氺
    0x2EA3: "\u706c",  # ⺣ -> 灬
    0x2EA6: "\u4e2c",  # ⺦ -> 丬
    0x2EA8: "\u72ad",  # ⺨ -> 犭
    0x2EAB: "\u7f52",  # ⺫ -> 罒
    0x2EAC: "\u793a",  # ⺬ -> 示
    0x2EAD: "\u793b",  # ⺭ -> 礻
    0x2EB1: "\u7f53",  # ⺱ -> 罓
    0x2EB2: "\u7f52",  # ⺲ -> 罒
    0x2EB9: "\u8002",  # ⺹ -> 耂
    0x2EBE: "\u8279",  # ⺾ -> 艹
    0x2EC1: "\u864e",  # ⻁ -> 虎
    0x2EC2: "\u8864",  # ⻂ -> 衤
    0x2EC3: "\u8980",  # ⻃ -> 覀
    0x2EC4: "\u897f",  # ⻄ -> 西
    0x2ECD: "\u8fb6",  # ⻍ -> 辶
    0x2ECF: "\u961d",  # ⻏ -> 阝
    0x2ED1: "\u9577",  # ⻑ -> 長
    0x2ED2: "\u9577",  # ⻒ -> 長 (the common form, not the variant 镸)
    0x2ED6: "\u961d",  # ⻖ -> 阝
    0x2ED8: "\u9752",  # ⻘ -> 青
    0x2EDF: "\u98e0",  # ⻟ -> 飠
    0x2EE4: "\u9b3c",  # ⻤ -> 鬼
    0x2EE8: "\u9ea6",  # ⻨ -> 麦
    0x2EE9: "\u9ec4",  # ⻩ -> 黄
    0x2EEB: "\u6589",  # ⻫ -> 斉
    0x2EED: "\u6b6f",  # ⻭ -> 歯
    0x2EEF: "\u7adc",  # ⻯ -> 竜
    0x2EF2: "\u4e80",  # ⻲ -> 亀
}

# Ideographs that are replaced by the form used in Japanese text.
_JAPANESE_FORMS = {
    "\u6236": "\u6238",  # 戶 -> 戸
    "\u9ed1": "\u9ed2",  # 黑 -> 黒
}

_radical_map = dict(_SUPPLEMENT_RADICALS)

# Every Kangxi radical (U+2F00..U+2FD5) has a Unicode compatibility
# decomposition to its ideograph, so NFKC yields the target character
# without a hand-typed 214-entry table.
for _code_point in range(0x2F00, 0x2FD6):
    _ideograph = unicodedata.normalize("NFKC", chr(_code_point))
    _radical_map[_code_point] = _JAPANESE_FORMS.get(_ideograph, _ideograph)

# Also convert the two ideographs themselves when they appear directly.
_radical_map.update({ord(src): dst for src, dst in _JAPANESE_FORMS.items()})

tbl = str.maketrans(_radical_map)
# Run every cell through the radical translation table `tbl`.
# Series.map per column is the element-wise equivalent of
# DataFrame.applymap, which is deprecated since pandas 2.1.
df = df.apply(lambda column: column.map(lambda s: s.translate(tbl)))

# Discard the existing index and number the rows from 1 instead of 0.
df.reset_index(drop=True, inplace=True)
df.index += 1

# utf_8_sig prepends a UTF-8 byte-order mark to the file.
# NOTE(review): presumably so spreadsheet applications detect the encoding.
df.to_csv("gotoeat_hokkaido.csv", encoding="utf_8_sig")
Recommended Posts