python-30: Web Scraper 2
Recursive
jobs = job_list.find_all("li", recursive=False)
- Finds only the li elements that sit directly under the ul, ignoring li elements nested deeper
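A minimal sketch with a toy document showing the difference (the html string here is made up for illustration):
from bs4 import BeautifulSoup
html = "<ul><li>direct child<ul><li>nested grandchild</li></ul></li></ul>"
soup = BeautifulSoup(html, "html.parser")
ul = soup.find("ul")
print(len(ul.find_all("li")))                   # 2 -> searches the whole subtree
print(len(ul.find_all("li", recursive=False)))  # 1 -> only direct children of the ul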
None
for job in jobs:
    zone = job.find("div", class_="mosaic-zone")
    if zone is None:
        print("job li")
    else:
        print("mosaic li")
- A data type you can assign to a variable or put inside a list or dictionary
- Used to represent the absence of something
- Not equal to False
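A quick illustration of those three points:
zone = None
print(zone == False)   # False -> None is not equal to False
print(zone is None)    # True  -> the idiomatic way to test for None
jobs = [None, "data"]            # None can sit inside a list...
config = {"proxy": None}         # ...or a dictionary, like any other value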
select
- Searches using a CSS selector
h2 = job.find("h2", class_="jobTitle")
a = h2.find("a")
corresponds to:
anchor = job.select("h2 a")
- select returns a list of every match; if you want just one element, use select_one
anchor = job.select_one("h2 a")
* aria-label: an attribute you add to an HTML element specifying the text a screen reader should read aloud
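For reference, select returns a list while select_one returns a single element (or None), and attributes such as aria-label are read with dictionary-style indexing:
anchors = job.select("h2 a")      # list of all matches (possibly empty)
anchor = job.select_one("h2 a")   # first match only, or None if nothing matches
if anchor:
    print(anchor["aria-label"])   # attribute access works like a dictionary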
# from requests import get
# from extractors.wwr import extract_wwr_jobs
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

browser = webdriver.Chrome(options=options)
browser.get("https://indeed.com/jobs?q=python")

result = []
soup = BeautifulSoup(browser.page_source, "html.parser")
job_list = soup.find("ul", class_="jobsearch-ResultsList")
jobs = job_list.find_all("li", recursive=False)
for job in jobs:
    zone = job.find("div", class_="mosaic-zone")
    if zone is None:
        anchor = job.select_one("h2 a")
        title = anchor['aria-label']
        link = anchor['href']
        company = job.find("span", class_="companyName")
        location = job.find("div", class_="companyLocation")
        job_data = {
            'link': f"https://indeed.com{link}",  # href already starts with "/"
            'company': company.string.strip(),
            'location': location.string.strip(),
            'position': title,
        }
        result.append(job_data)
for job_data in result:
    print(job_data, "\n///////\n")
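One detail the script skips: it never closes the Chrome window it launched. A minimal cleanup sketch using Selenium's quit(), wrapped in try/finally so the browser shuts down even if parsing raises an error:
browser = webdriver.Chrome(options=options)
try:
    browser.get("https://indeed.com/jobs?q=python")
    soup = BeautifulSoup(browser.page_source, "html.parser")
    # ... same parsing as above ...
finally:
    browser.quit()   # always close the browser process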
Getting the page count
# from requests import get
# from extractors.wwr import extract_wwr_jobs
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_page_count(keyword):
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    base_url = "https://indeed.com/jobs?q="
    browser = webdriver.Chrome(options=options)
    browser.get(f"{base_url}{keyword}")
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pagination = soup.find("nav", class_="css-jbuxu0 ecydgvn0")
    if pagination is None:
        return 1
    pages = pagination.find_all("div", recursive=False)
    count = len(pages)
    if count >= 5:
        return 5
    else:
        return count

print(get_page_count("python"))
print(get_page_count("nextjs"))
print(get_page_count("django"))
print(get_page_count("ruby"))
- Build a function that fetches the number of result pages
- If pagination is None (not found), return 1 and stop counting
- Otherwise, count pagination's direct div children: if the count is 5 or greater, just return 5; if not (fewer than 5), return the actual count (an equivalent min() form is sketched below)
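The final branch can be collapsed with the built-in min(), which returns the smaller of its arguments; an equivalent sketch of the last few lines:
pages = pagination.find_all("div", recursive=False)
return min(len(pages), 5)   # 5 when there are 5+ page links, the real count otherwise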
range
- A function that returns a sequence object
- A kind of shortcut for producing a list of numbers of whatever length you want
range(5)
for i in range(5):
    print(i)
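range(stop) counts from 0 up to, but not including, stop, and also accepts start and step arguments; the same pattern produces the &start= offsets used below:
print(list(range(5)))         # [0, 1, 2, 3, 4]
print(list(range(2, 10, 2)))  # [2, 4, 6, 8] -> start, stop, step
for page in range(3):
    print(page * 10)          # 0, 10, 20 -> one offset per results page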
# from requests import get
# from extractors.wwr import extract_wwr_jobs
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_page_count(keyword):
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    base_url = "https://kr.indeed.com/jobs?q="
    browser = webdriver.Chrome(options=options)
    browser.get(f"{base_url}{keyword}")
    soup = BeautifulSoup(browser.page_source, "html.parser")
    pagination = soup.find("nav", class_="css-jbuxu0 ecydgvn0")
    if pagination is None:
        return 1
    pages = pagination.find_all("div", recursive=False)
    count = len(pages)
    if count >= 5:
        return 5
    else:
        return count

def extract_indeed_jobs(keyword):
    pages = get_page_count(keyword)
    print("Found", pages, "pages")
    result = []
    for page in range(pages):
        options = Options()
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        browser = webdriver.Chrome(options=options)
        base_url = "https://www.indeed.com/jobs"
        final_url = f"{base_url}?q={keyword}&start={page*10}"
        print("Requesting", final_url)
        browser.get(final_url)
        soup = BeautifulSoup(browser.page_source, "html.parser")
        job_list = soup.find("ul", class_="jobsearch-ResultsList")
        jobs = job_list.find_all("li", recursive=False)
        for job in jobs:
            zone = job.find("div", class_="mosaic-zone")
            if zone is None:
                anchor = job.select_one("h2 a")
                title = anchor['aria-label']
                link = anchor['href']
                company = job.find("span", class_="companyName")
                location = job.find("div", class_="companyLocation")
                job_data = {
                    'link': f"https://www.indeed.com{link}",  # href already starts with "/"
                    'company': company.string,
                    'location': location.string,
                    'position': title,
                }
                result.append(job_data)
    return result

jobs = extract_indeed_jobs("python")
print(jobs)
print(len(jobs))
- for page in range(pages) combined with the page*10 offset in final_url lets the scraper visit each results page and collect its data.
- Create an indeed.py file inside the extractors folder and paste all of the code above into it (layout sketched below).
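The resulting project layout looks roughly like this (main.py is an assumed name for the top-level script; the extractors names come from the imports below):
main.py              # the combined script below (name assumed)
extractors/
    wwr.py           # extract_wwr_jobs, from the earlier lesson
    indeed.py        # extract_indeed_jobs, the code above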
Combining the two files
from extractors.indeed import extract_indeed_jobs
from extractors.wwr import extract_wwr_jobs

keyword = input("What do you want to search for? ")
indeed = extract_indeed_jobs(keyword)
wwr = extract_wwr_jobs(keyword)
jobs = indeed + wwr
for job in jobs:
    print(job)
    print("/////\n/////")