python-37: Cache
- Speed up the website by building a fake database (a dict) so that results we have already fetched are not scraped again
Check whether the keyword the user searched for is already in db
-main.py-
@app.route("/search")
def search():
    keyword = request.args.get("keyword")
    if keyword in db:
        jobs = db[keyword]
    else:
        indeed = extract_indeed_jobs(keyword)
        wwr = extract_wwr_jobs(keyword)
        jobs = indeed + wwr
    return render_template("search.html", keyword=keyword, jobs=jobs)
- If the keyword is in db, jobs is set to the value stored under that keyword, so the cached jobs are returned
- Otherwise, jobs is extracted as before
If the keyword is not in db, we also need to extract the jobs and then save them into db:
@app.route("/search")
def search():
    keyword = request.args.get("keyword")
    if keyword in db:
        jobs = db[keyword]
    else:
        indeed = extract_indeed_jobs(keyword)
        wwr = extract_wwr_jobs(keyword)
        jobs = indeed + wwr
        db[keyword] = jobs
    return render_template("search.html", keyword=keyword, jobs=jobs)
- db[keyword] = jobs is the new line: it caches the scraped results so the next search for the same keyword skips scraping entirely
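As an aside, Python's standard library has a built-in version of this memoization pattern, functools.lru_cache. A minimal sketch of how the same idea could be expressed with it (not this project's approach, which needs the explicit db dict so /export can read the cache):

from functools import lru_cache

@lru_cache(maxsize=None)
def get_jobs(keyword):  # hypothetical helper, not in the course code
    # runs the scrapers at most once per distinct keyword;
    # repeated calls return the cached result immediately
    return extract_indeed_jobs(keyword) + extract_wwr_jobs(keyword)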
When the user does not enter a keyword
- keyword will be None, so redirect back to the home page (a quick check follows the snippet):
@app.route("/search")
def search():
    keyword = request.args.get("keyword")
    if keyword is None:
        return redirect("/")
    if keyword in db:
        jobs = db[keyword]
    else:
        indeed = extract_indeed_jobs(keyword)
        wwr = extract_wwr_jobs(keyword)
        jobs = indeed + wwr
        db[keyword] = jobs
    return render_template("search.html", keyword=keyword, jobs=jobs)
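A hypothetical way to confirm the redirect with Flask's built-in test client (not part of the course code; it assumes app is the Flask object above and would have to run before the app.run(...) call at the bottom of main.py):

# assumes `app` is the Flask object defined in main.py
with app.test_client() as client:
    response = client.get("/search")     # no ?keyword= -> keyword is None
    print(response.status_code)          # 302: redirected
    print(response.headers["Location"])  # "/" (the home page)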
When the user tries to export with a keyword that is not in db
@app.route("/export")
def export():
    keyword = request.args.get("keyword")
    if keyword is None:
        return redirect("/")
    if keyword not in db:
        return redirect(f"/search?keyword={keyword}")
Creating the file
- In main.py:
from flask import Flask, render_template, request, redirect, send_file

@app.route("/export")
def export():
    keyword = request.args.get("keyword")
    if keyword is None:
        return redirect("/")
    if keyword not in db:
        return redirect(f"/search?keyword={keyword}")
    save_to_file(keyword, db[keyword])
    return send_file(f"{keyword}.csv", as_attachment=True)
- send_file was added to the flask import
- save_to_file(keyword, db[keyword]): the file name will be the keyword the user typed, and the jobs are whatever db holds under that keyword
- Since the file must be sent to the user: send_file(file name, as_attachment=True)
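A hypothetical test-client check of the cache-miss path (again not part of the course code, and run before app.run(...)):

# assumes `app` is the Flask object defined in main.py
with app.test_client() as client:
    response = client.get("/export?keyword=python")
    # "python" was never searched, so we are bounced to /search
    # first, which scrapes the jobs and fills the cache
    print(response.headers["Location"])  # /search?keyword=python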
- In search.html, add a link to /export:
<hgroup>
    <h1>Search Results for "{{keyword}}"</h1>
    <a target="_blank" href="/export?keyword={{keyword}}">Export to file</a>
</hgroup>
Final code
-main.py-
from flask import Flask, render_template, request, redirect, send_file
from extractors.indeed import extract_indeed_jobs
from extractors.wwr import extract_wwr_jobs
from file import save_to_file

app = Flask("JobScrapper")

@app.route("/")
def home():
    return render_template("home.html", name="nico")

db = {}

@app.route("/search")
def search():
    keyword = request.args.get("keyword")
    if keyword is None:
        return redirect("/")
    if keyword in db:
        jobs = db[keyword]
    else:
        indeed = extract_indeed_jobs(keyword)
        wwr = extract_wwr_jobs(keyword)
        jobs = indeed + wwr
        db[keyword] = jobs
    return render_template("search.html", keyword=keyword, jobs=jobs)

@app.route("/export")
def export():
    keyword = request.args.get("keyword")
    if keyword is None:
        return redirect("/")
    if keyword not in db:
        return redirect(f"/search?keyword={keyword}")
    save_to_file(keyword, db[keyword])
    return send_file(f"{keyword}.csv", as_attachment=True)

app.run("0.0.0.0")
-indeed.py-
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
browser = webdriver.Chrome(options=options)

def get_page_count(keyword):
    base_url = "https://kr.indeed.com/jobs?q="
    browser.get(f"{base_url}{keyword}")
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pagination = soup.find("nav", class_="css-jbuxu0 ecydgvn0")
    if pagination is None:
        # no pagination nav means a single page of results
        return 1
    pages = pagination.find_all("div", recursive=False)
    count = len(pages)
    # cap at 5 pages to keep scraping time reasonable
    if count >= 5:
        return 5
    else:
        return count

def extract_indeed_jobs(keyword):
    pages = get_page_count(keyword)
    print("Found", pages, "pages")
    result = []
    for page in range(pages):
        base_url = "https://kr.indeed.com/jobs"
        final_url = f"{base_url}?q={keyword}&start={page*10}"
        print("Requesting", final_url)
        browser.get(final_url)
        soup = BeautifulSoup(browser.page_source, "html.parser")
        job_list = soup.find("ul", class_="jobsearch-ResultsList")
        jobs = job_list.find_all("li", recursive=False)
        for job in jobs:
            # list items containing a mosaic-zone div are banners, not job posts
            zone = job.find("div", class_="mosaic-zone")
            if zone is None:
                anchor = job.select_one("h2 a")
                title = anchor["aria-label"]
                link = anchor["href"]
                company = job.find("span", class_="companyName")
                location = job.find("div", class_="companyLocation")
                # commas are stripped so the fields don't break the CSV later
                job_data = {
                    'link': f"https://kr.indeed.com{link}",
                    'company': company.string.replace(",", " "),
                    'location': location.string.replace(",", " "),
                    'position': title.replace(",", " "),
                }
                result.append(job_data)
    return result
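One fragile spot: Indeed changes these CSS class names often, and soup.find returns None when a selector no longer matches, so job_list.find_all would raise AttributeError. A defensive variant of that step (a sketch, not the course code):

job_list = soup.find("ul", class_="jobsearch-ResultsList")
if job_list is None:
    # markup changed or the request was blocked; skip this page
    print("No results list found on", final_url)
    continue
jobs = job_list.find_all("li", recursive=False)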
-wwr.py-
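A minimal sketch of what extract_wwr_jobs could look like, mirroring the structure of indeed.py. The search URL and every selector below ("jobs", "company", "title", the anchor positions) are assumptions about We Work Remotely's markup and should be verified against the live site:

from requests import get
from bs4 import BeautifulSoup

def extract_wwr_jobs(keyword):
    # NOTE: URL pattern and class names are assumptions, not verified
    base_url = "https://weworkremotely.com/remote-jobs/search?term="
    response = get(f"{base_url}{keyword}",
                   headers={"User-Agent": "Mozilla/5.0"})
    result = []
    if response.status_code != 200:
        print("Can't request page:", response.status_code)
        return result
    soup = BeautifulSoup(response.text, "html.parser")
    for section in soup.find_all("section", class_="jobs"):
        posts = section.find_all("li")
        posts.pop(-1)  # the last <li> is a "view all" link, not a job
        for post in posts:
            anchors = post.find_all("a")
            anchor = anchors[1]  # anchors[0] is a tooltip link
            link = anchor["href"]
            company, kind, region = anchor.find_all("span", class_="company")
            title = anchor.find("span", class_="title")
            job_data = {
                'link': f"https://weworkremotely.com{link}",
                'company': company.string.replace(",", " "),
                'location': region.string.replace(",", " "),
                'position': title.string.replace(",", " "),
            }
            result.append(job_data)
    return result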
-home.html-
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width= , initial-scale=1.0">
<title>Job scrapper</title>
<link rel="stylesheet" href="https://unpkg.com/@picocss/pico@latest/css/pico.min.css">
</head>
<body>
<main class="container">
<h1>Job Scrapper</h1>
<h4>What job do you want?</h4>
<form action="/search">
<input type="text" name="keyword" placeholder="Write keyword please!"/>
<button>Search</button>
</form>
</main>
</body>
</html>
-search.html-
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width= , initial-scale=1.0">
<title>Job scrapper</title>
<link rel="stylesheet" href="https://unpkg.com/@picocss/pico@latest/css/pico.min.css">
</head>
<body>
<main class="container">
<hgroup>
<h1>Search Results for "{{keyword}}"</h1>
<a target="_blank" href="/export?keyword={{keyword}}">Export to file</a>
</hgroup>
<figure>
<table role="grid">
<thead>
<tr>
<th>Position</th>
<th>Company</th>
<th>Location</th>
<th>Link</th>
</tr>
</thead>
<tbody>
    {% for job in jobs %}
    <tr>
        <td>{{job.position}}</td>
        <td>{{job.company}}</td>
        <td>{{job.location}}</td>
        <td><a href="{{job.link}}" target="_blank">Link</a></td>
    </tr>
    {% endfor %}
</tbody>
</table>
</figure>
</main>
</body>
</html>
-file.py-
def save_to_file(file_name, jobs):
    # utf-8-sig adds a BOM so Excel opens Korean text correctly
    file = open(f"{file_name}.csv", "w", encoding="utf-8-sig")
    file.write("Position,Company,Location,URL\n")
    for job in jobs:
        file.write(f"{job['position']},{job['company']},{job['location']},{job['link']}\n")
    file.close()
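Because the file is written by hand, the extractors had to strip commas out of every field. Python's csv module handles this by quoting fields instead; a sketch of that alternative (a swap, not the course's approach):

import csv

def save_to_file(file_name, jobs):
    # csv.writer quotes fields containing commas, so the
    # .replace(",", " ") calls in the extractors become unnecessary
    with open(f"{file_name}.csv", "w", encoding="utf-8-sig", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Position", "Company", "Location", "URL"])
        for job in jobs:
            writer.writerow([job["position"], job["company"],
                             job["location"], job["link"]])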
Replit: https://replit.com/@prayer0420/webscrapper-1#main.py