
  • Build a fake database so that results that have already been scraped aren't scraped again, which speeds up the website

Check whether the keyword the user searched for is in the db.
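
The "fake database" is nothing more than an in-memory Python dict. A minimal sketch of the idea on its own (the keyword and values here are made up for illustration):

db = {}                      # in-memory cache: keyword -> list of jobs
db["python"] = ["job1"]      # store a scraped result
print("python" in db)        # True -> the next search can skip scraping
print(db["python"])          # ['job1'] -> served straight from memory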

-main.py-

@app.route("/search")
def search():
  keyword = request.args.get("keyword")
  if keyword in db:
    jobs = db[keyword]
  else:
    indeed = extract_indeed_jobs(keyword)
    wwr = extract_wwr_jobs(keyword)
    jobs = indeed + wwr
  return render_template("search.html", keyword=keyword, jobs=jobs)
  • If the keyword is in the db

  • jobs is set to the value stored under that keyword, and those cached jobs are returned

  • Otherwise, the jobs are extracted as before and returned

If the keyword is not in the db, extract the jobs and then store the result in the db.

@app.route("/search")
def search():
  keyword = request.args.get("keyword")
  if keyword in db:
    jobs = db[keyword]
  else:
    indeed = extract_indeed_jobs(keyword)
    wwr = extract_wwr_jobs(keyword)
    jobs = indeed + wwr
    db[keyword] = jobs
  return render_template("search.html", keyword=keyword, jobs=jobs)
  • db[keyword] = jobs caches the scraped jobs under the keyword, so the next search for the same keyword is served from memory instead of scraping again

When the user doesn't enter a keyword

  • keyword becomes None, so redirect back to the home page
@app.route("/search")
def search():
  keyword = request.args.get("keyword")
  if keyword == None:
    return redirect("/")
  if keyword in db:
    jobs = db[keyword]
  else: 
      indeed = extract_indeed_jobs(keyword)
      wwr = extract_wwr_jobs(keyword)
      jobs = indeed + wwr
      db[keyword] = jobs
  return render_template("search.html", keyword=keyword, jobs=jobs)
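
A quick way to see the cache working (an illustrative check, assuming the app and db from the final code below; note the first call drives the real scrapers):

with app.test_client() as client:
  client.get("/search?keyword=python")  # first request scrapes and fills db
  client.get("/search?keyword=python")  # second request is served from db
  print("python" in db)                 # True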

When the user tries to export a keyword that isn't in the db, redirect to /search so that the jobs get scraped and cached first.

@app.route("/export")
def export():
  keyword = request.args.get("keyword")
  if keyword is None:
    return redirect("/")
  if keyword not in db:
    return redirect(f"/search?keyword={keyword}")

Creating the file

  1. In main.py

from flask import Flask, render_template, request, redirect, send_file

@app.route("/export")
def export():
  keyword = request.args.get("keyword")
  if keyword is None:
    return redirect("/")
  if keyword not in db:
    return redirect(f"/search?keyword={keyword}")
  save_to_file(keyword, db[keyword])
  return send_file(f"{keyword}.csv", as_attachment=True)
  • Add send_file to the flask import
  • save_to_file(the file name is the keyword the user entered, and the jobs are the value stored under that keyword in the db)
  • The file has to be sent to the user, so send_file(file name, as_attachment=True); as_attachment=True makes the browser download the file instead of displaying it
  2. In search.html, add a link to the export route
    <hgroup>
      <h1>Search Results for":"</h1>
    <a target="_blank" href="/export?keyword="> 
    Export to file</a>

Final code

-main.py-

from flask import Flask, render_template, request, redirect, send_file
from extractors.indeed import extract_indeed_jobs
from extractors.wwr import extract_wwr_jobs
from file import save_to_file
 
 
app = Flask("JobScrapper")
 
@app.route("/")
def home():
  return render_template("home.html", name="nico")
 
db = {}
 
@app.route("/search")
def search():
  keyword = request.args.get("keyword")
  if keyword is None:
    return redirect("/")
  if keyword in db:
    jobs = db[keyword]
  else:
    indeed = extract_indeed_jobs(keyword)
    wwr = extract_wwr_jobs(keyword)
    jobs = indeed + wwr
    db[keyword] = jobs
  return render_template("search.html", keyword=keyword, jobs=jobs)
 
@app.route("/export")
def export():
  keyword = request.args.get("keyword")
  if keyword is None:
    return redirect("/")
  if keyword not in db:
    return redirect(f"/search?keyword={keyword}")
  save_to_file(keyword, db[keyword])
  return send_file(f"{keyword}.csv", as_attachment=True)
  
app.run("0.0.0.0")

-indeed.py-

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
 
 
options = Options()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
browser = webdriver.Chrome(options=options)
 
def get_page_count(keyword):
  base_url = "https://kr.indeed.com/jobs?q="
  browser.get(f"{base_url}{keyword}")
  soup = BeautifulSoup(browser.page_source, "html.parser")
  pagination = soup.find("nav", class_="css-jbuxu0 ecydgvn0")
  if pagination == None:
    return 1
  pages = pagination.find_all("div", recursive=False)
  count = (len(pages))
  if count >= 5:
    return 5
  else:
    return count
 
 
def extract_indeed_jobs(keyword):
  pages = get_page_count(keyword)
  print("Found", pages, "pages")
  result = []
  for page in range(pages):
    base_url = "https://kr.indeed.com/jobs"
    final_url = f"{base_url}?q={keyword}&start={page*10}"
    print("Requesting", final_url)
    browser.get(final_url)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    job_list = soup.find("ul", class_="jobsearch-ResultsList")
    jobs = job_list.find_all("li", recursive=False)
    for job in jobs:
      # list items containing a mosaic-zone div are ad slots, not job cards
      zone = job.find("div", class_="mosaic-zone")
      if zone is None:
        anchor = job.select_one("h2 a")
        title = anchor['aria-label']
        link = anchor['href']
        company = job.find("span", class_="companyName")
        location = job.find("div", class_="companyLocation")
        job_data = {
          'link': f"https://kr.indeed.com{link}",
          # commas are stripped so the hand-written CSV in file.py stays valid
          'company': company.string.replace(",", " "),
          'location': location.string.replace(",", " "),
          'position': title.replace(",", " "),
        }
        result.append(job_data)
 
  return result

-wwr.py-

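A sketch of what wwr.py can look like (hedged: the selectors and page structure below are assumptions about weworkremotely.com's markup, not something this post confirms; only the function name extract_wwr_jobs, its keyword parameter, and the job_data keys are fixed by main.py and file.py). It uses requests instead of Selenium, assuming the site serves its listings as static HTML:

from requests import get
from bs4 import BeautifulSoup


def extract_wwr_jobs(keyword):
  base_url = "https://weworkremotely.com/remote-jobs/search?term="
  response = get(f"{base_url}{keyword}")
  results = []
  if response.status_code != 200:
    print("Can't request page:", response.status_code)
    return results
  soup = BeautifulSoup(response.text, "html.parser")
  # assumed markup: each <section class="jobs"> holds one category of postings
  for job_section in soup.find_all("section", class_="jobs"):
    job_posts = job_section.find_all("li")
    if job_posts:
      job_posts.pop(-1)  # assumed: the last li is a "view all" link, not a posting
    for post in job_posts:
      anchors = post.find_all("a")
      if len(anchors) < 2:
        continue
      anchor = anchors[1]  # assumed: the second anchor links to the job detail page
      link = anchor["href"]
      spans = anchor.find_all("span", class_="company")
      title = anchor.find("span", class_="title")
      if title is None or len(spans) < 3:
        continue
      company, kind, region = spans
      job_data = {
        'link': f"https://weworkremotely.com{link}",
        'company': company.string.replace(",", " "),
        'location': region.string.replace(",", " "),
        'position': title.string.replace(",", " "),
      }
      results.append(job_data)
  return results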

-home.html-

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width= , initial-scale=1.0">
  <title>Job scrapper</title>
  <link rel="stylesheet" href="https://unpkg.com/@picocss/pico@latest/css/pico.min.css">
 
</head>
<body>
  <main class="container">
    <h1>Job Scrapper</h1>
    <h4>Waht job do you want?</h4>
    <form action="/search">
     <input type="text" name="keyword" placeholder="Write keyword please!"/>
     <button>Search</button>
    </form>
  </main>
</body>
</html>

-search.html-

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width= , initial-scale=1.0">
  <title>Job scrapper</title>
  <link rel="stylesheet" href="https://unpkg.com/@picocss/pico@latest/css/pico.min.css">
</head>
<body>
  <main class="container">
    <hgroup>
      <h1>Search Results for":"</h1>
    <a target="_blank" href="/export?keyword="> 
    Export to file</a>
    </hgroup>
    <figure>
      <table role="grid"> 
      <thead>
        <tr>
          <th>Position</th>
          <th>Company</th>
          <th>Location</th>
          <th>Link</th>
        </tr>
      </thead>
      <tbody>
      
      </tbody>
    </table>
    </figure>
  </main>
</body>
</html>

-file.py-

def save_to_file(file_name, jobs):
  # utf-8-sig writes a BOM so Excel recognizes the file as UTF-8
  file = open(f"{file_name}.csv", "w", encoding="utf-8-sig")
  file.write("Position,Company,Location,URL\n")
  for job in jobs:
    file.write(f"{job['position']},{job['company']},{job['location']},{job['link']}\n")
  file.close()
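
For reference, a sketch of the same export using Python's standard csv module, which handles quoting automatically, so the extractors wouldn't need to strip commas out of the scraped fields (an alternative, not the code this post uses):

import csv


def save_to_file_csv(file_name, jobs):
  # csv.writer quotes fields that contain commas, so values stay intact
  with open(f"{file_name}.csv", "w", encoding="utf-8-sig", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Position", "Company", "Location", "URL"])
    for job in jobs:
      writer.writerow([job['position'], job['company'], job['location'], job['link']])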

Replit link: https://replit.com/@prayer0420/webscrapper-1#main.py
