TL;DR
Using Python's requests, json, and beautifulsoup4 modules, I scraped the employer, job title, and job description of "Data Analyst" postings from three job sites: Cake Resume, 104 Job Bank, and Yourator. The scraper's output is available in a Google Sheet:
https://docs.google.com/spreadsheets/d/1hCljd0EdbyODlmDyRg2tBte22gJsyTXM69-eO87OpR4/edit?usp=sharing
Background
My course "How to Become a Data Analyst" launched on Hahow in 2020. Although the course is positioned as an overview and pitched as No-code, or at most Low-code, so as not to scare off interested students at the start, in the spirit of being data-driven it still included Python scrapers that collected and parsed data analyst job descriptions from CakeResume and Indeed. As time passed, site redesigns and upgraded anti-scraping mechanisms broke the scrapers provided back then.
From a course-maintenance perspective, the best practice for demo scrapers is to download the static HTML to local files first and then parse those files. This guarantees that the same code keeps working at any point in time, reducing the two core scraping tasks, requests and parsing, to parsing alone. Indeed, one of the job sites chosen back then, now sits behind Cloudflare for DDoS protection and anti-scraping, so this time I dropped Indeed and instead searched for "Data Analyst" on Cake Resume, 104 Job Bank, and Yourator to collect job descriptions.
All three sites are parsed in the same two steps: first visit the search result pages and extract the links to the individual job pages, then follow each link and parse out the employer, job title, and job description.
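As a minimal sketch of this download-then-parse pattern (the helper names fetch_and_save and parse_from_file are my own, not from the original course code):

import requests
from bs4 import BeautifulSoup

def fetch_and_save(url: str, file_path: str) -> None:
    # Download a page once so later parsing never depends on the live site.
    r = requests.get(url)
    with open(file_path, "w") as file:
        file.write(r.text)

def parse_from_file(file_path: str) -> BeautifulSoup:
    # Parse the cached HTML offline, as many times as needed.
    with open(file_path) as file:
        return BeautifulSoup(file, "html.parser")

The sections below apply this pattern to each site in turn.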
Cake Resume
Taking the first five pages of Cake Resume search results as an example, first download the static HTML of these five pages into the local project folder.
import requests

def download_first_five_pages():
    # Save the first five pages of search results as local HTML files.
    for page_number in range(1, 6):
        html_to_save = f"https://www.cakeresume.com/jobs/data%20analyst?location_list%5B0%5D=Taiwan&job_type%5B0%5D=full_time&seniority_level%5B0%5D=entry_level&page={page_number}"
        r = requests.get(html_to_save)
        with open(f"cake_resume/search_result_page_{page_number}.html", "w") as file:
            file.write(r.text)
Next, extract the job links from these five static pages into a list.
from bs4 import BeautifulSoup

def get_job_url_list() -> list:
    # This selector is tied to the site's current class names and may
    # break whenever the page layout is updated.
    job_title_css_selector = "div.JobSearchItem_headerTitle__CuE3V > a"
    job_url_list = []
    for page_number in range(1, 6):
        with open(f"cake_resume/search_result_page_{page_number}.html") as file:
            soup = BeautifulSoup(file, "html.parser")
        job_url_hrefs = [elem.get("href") for elem in soup.select(job_title_css_selector)]
        job_urls = [f"https://www.cakeresume.com{job_url_href}" for job_url_href in job_url_hrefs]
        job_url_list += job_urls
    return job_url_list
Then download each job page into the local project folder.
def download_job_descriptions(job_url_list: list):
    # The last URL segment doubles as the local file name.
    for job_url in job_url_list:
        r = requests.get(job_url)
        page_name = job_url.split("/")[-1]
        with open(f"cake_resume/job_descriptions/{page_name}.html", "w") as file:
            file.write(r.text)
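Since this fires dozens of requests back to back, it can be gentler on the site to pause between requests. A sketch of such a variant (the function name and the one-second default delay are my own arbitrary choices):

import time
import requests

def download_job_descriptions_politely(job_url_list: list, delay_seconds: float = 1.0):
    # Same as download_job_descriptions, with a pause between requests.
    for job_url in job_url_list:
        r = requests.get(job_url)
        page_name = job_url.split("/")[-1]
        with open(f"cake_resume/job_descriptions/{page_name}.html", "w") as file:
            file.write(r.text)
        time.sleep(delay_seconds)  # arbitrary delay; tune as needed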
Finally, parse the employer, job title, and job description from each job page and store them in a DataFrame.
import os
import pandas as pd

download_first_five_pages()
job_url_list = get_job_url_list()
download_job_descriptions(job_url_list)
list_dir = os.listdir("cake_resume/job_descriptions/")
job_titles, employers, job_descriptions = [], [], []
for html_file in list_dir:
    with open(f"cake_resume/job_descriptions/{html_file}") as file:
        soup = BeautifulSoup(file, "html.parser")
    job_title = soup.select("h2")[0].text
    employer = soup.select("a.JobDescriptionLeftColumn_name__ABAp9")[0].text
    job_description = soup.select("div.JobDescriptionBlock_container__W_6Sp > div > div.JobDescriptionBlock_content__EldPn > div.JobDescriptionBlock_leftColumn__16yxb > div > div.JobDescriptionLeftColumn_row__iY44x.JobDescriptionLeftColumn_mainContent__VrTGs")[0].text
    job_titles.append(job_title)
    employers.append(employer)
    job_descriptions.append(job_description)
df = pd.DataFrame()
df["employer"] = employers
df["job_title"] = job_titles
df["job_description"] = job_descriptions
df
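The parsed results can then be exported, for instance to a CSV file that is later uploaded into the Google Sheet linked in the TL;DR (a minimal sketch; the file name is my own choice):

# Export the DataFrame built above; the file name is arbitrary.
df.to_csv("cake_resume_data_analyst_jobs.csv", index=False)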
104 Job Bank
Taking the first five pages of 104 Job Bank search results as an example, first download the static HTML of these five pages into the local project folder.
def download_first_five_pages():
    # The keyword parameter is the URL-encoded "資料分析師" (data analyst).
    for page_number in range(1, 6):
        html_to_save = f"https://www.104.com.tw/jobs/search/?ro=1&kwop=7&keyword=%E8%B3%87%E6%96%99%E5%88%86%E6%9E%90%E5%B8%AB&expansionType=area%2Cspec%2Ccom%2Cjob%2Cwf%2Cwktm&order=12&asc=0&page={page_number}&mode=s&jobsource=2018indexpoc&langFlag=0&langStatus=0&recommendJob=1&hotJob=1"
        r = requests.get(html_to_save)
        with open(f"104/search_result_page_{page_number}.html", "w") as file:
            file.write(r.text)
Next, extract the job links from these five static pages into a list. One thing to note: 104 Job Bank mixes "hot jobs" and employer-sponsored postings into the search results, and these can be completely unrelated to the search, so we can drop them by checking whether a link contains "hotjob_chr".
def get_job_url_list() -> list:
    job_title_css_selector = "h2 > a"
    job_url_list = []
    for page_number in range(1, 6):
        with open(f"104/search_result_page_{page_number}.html") as file:
            soup = BeautifulSoup(file, "html.parser")
        job_url_hrefs = [elem.get("href") for elem in soup.select(job_title_css_selector)]
        # Drop sponsored "hot job" links that are unrelated to the search.
        job_urls = [f"https:{job_url_href}" for job_url_href in job_url_hrefs if "hotjob_chr" not in job_url_href]
        job_url_list += job_urls
    return job_url_list
Then download each job page into the local project folder.
def download_job_descriptions(job_url_list: list):
    for job_url in job_url_list:
        r = requests.get(job_url)
        # Strip the query string so the last path segment can serve as the file name.
        job_url_split = job_url.split("?")[0]
        page_name = job_url_split.split("/")[-1]
        with open(f"104/job_descriptions/{page_name}.html", "w") as file:
            file.write(r.text)
Finally, parse the employer, job title, and job description from each job page and store them in a DataFrame.
download_first_five_pages()
job_url_list = get_job_url_list()
download_job_descriptions(job_url_list)
list_dir = os.listdir("104/job_descriptions/")
job_titles, employers, job_descriptions = [], [], []
for html_file in list_dir:
    with open(f"104/job_descriptions/{html_file}") as file:
        soup = BeautifulSoup(file, "html.parser")
    job_title = soup.select("div.job-header__title > h1")[0].text
    # The last whitespace-separated token is not part of the title, so drop it.
    job_title = " ".join(job_title.split()[:-1])
    employer = soup.select("div.job-header__title > div > a:nth-child(1)")[0].text.strip()
    job_description = [elem.text for elem in soup.select("#app > div > div.container.jb-container.pt-4.position-relative > div > div.col.main > div.dialog.container-fluid.bg-white.rounded.job-description.mb-4.pt-6.pb-6 > div.job-description-table.row > div.job-description.col-12 > p")]
    job_description = " ".join(job_description)
    job_titles.append(job_title)
    employers.append(employer)
    job_descriptions.append(job_description)
df = pd.DataFrame()
df["employer"] = employers
df["job_title"] = job_titles
df["job_description"] = job_descriptions
df
Yourator
Taking the Yourator search results as an example, one thing is worth noting: unlike Cake Resume and 104 Job Bank, Yourator does not store the job links in the <a></a> tags of an HTML file but in a JSON file, so we use the json module instead of beautifulsoup4. Another thing to note is that besides the jobs posted on its own site, Yourator also shows jobs posted by its partner Teamdoor, so we can check whether "thirdPartyUrl" has a value to tell whether a job comes from Yourator or Teamdoor.
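The code below assumes the search response has already been saved locally as yourator/jobs.json. A minimal sketch of that step might look like the following; the endpoint and query parameter are my assumption from inspecting the search page's network traffic, not a documented Yourator API:

import requests

# Hypothetical endpoint observed in the browser's network tab; adjust the
# URL and parameters to whatever the search page actually requests.
r = requests.get("https://www.yourator.co/api/v2/jobs", params={"term[]": "Data Analyst"})
with open("yourator/jobs.json", "w") as file:
    file.write(r.text)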
import json

def get_job_urls() -> list:
    with open("yourator/jobs.json") as file:
        jobs_json = json.load(file)
    # Teamdoor jobs carry a thirdPartyUrl; Yourator's own jobs only have a path.
    job_urls = [item["thirdPartyUrl"] if item["thirdPartyUrl"] is not None else "https://www.yourator.co" + item["path"] for item in jobs_json["payload"]["jobs"]]
    return job_urls
Then download each job page into the local project folder.
def download_job_descriptions(job_urls: list):
    for job_url in job_urls:
        r = requests.get(job_url)
        # Replace slashes so the full URL can serve as a flat file name.
        page_name = job_url.replace("/", "_")
        with open(f"yourator/job_descriptions/{page_name}.html", "w") as file:
            file.write(r.text)
Finally, for each job page, determine whether it comes from Yourator or Teamdoor, then parse the employer, job title, and job description into a DataFrame accordingly.
job_urls = get_job_urls()
download_job_descriptions(job_urls)
list_dir = os.listdir("yourator/job_descriptions/")
job_titles, employers, job_descriptions = [], [], []
for html_file in list_dir:
    with open(f"yourator/job_descriptions/{html_file}") as file:
        soup = BeautifulSoup(file, "html.parser")
    if "teamdoor" in html_file:
        job_title = soup.select("h2.title")[0].text.strip()
        # The file name is the URL with "/" replaced by "_", so the employer
        # is the Teamdoor subdomain before the first dot (<employer>.teamdoor.io).
        employer = html_file.split("__")[1]
        employer = employer.split(".")[0]
        job_description = [elem.text for elem in soup.select("div.content-area.content > div > div > div > p")]
    else:
        job_title = soup.select("h1.basic-info__title__text")[0].text
        employer = soup.select("h4 > a")[0].text
        job_description = [elem.text for elem in soup.select("div > section > p")]
    job_description = " ".join(job_description)
    job_titles.append(job_title)
    employers.append(employer)
    job_descriptions.append(job_description)
df = pd.DataFrame()
df["employer"] = employers
df["job_title"] = job_titles
df["job_description"] = job_descriptions
df
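Assuming each site's DataFrame is kept under its own name (df_cakeresume, df_104, and df_yourator are hypothetical names of my own), the three results can be concatenated and exported for upload into the Google Sheet linked in the TL;DR:

import pandas as pd

# df_cakeresume, df_104, and df_yourator are hypothetical names for the
# three DataFrames built in the sections above.
df_all = pd.concat([df_cakeresume, df_104, df_yourator], ignore_index=True)
df_all.to_csv("data_analyst_jobs.csv", index=False)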