using Beautiful Soup 4, Requests, and Python
GitHub
Code
from requests import get
from bs4 import BeautifulSoup
# Scrape the "We Work Remotely" search results for one search term and print
# every job found as a dict with link, company, region and position.
#
# NOTE: the URLs were previously wrapped in "<...>" (markup residue from the
# notes export). With those characters inside the string literals the request
# URL is not a valid http(s) URL and every generated job link is broken.
base_url = "https://weworkremotely.com/remote-jobs/search?term="
search_term = "python"
response = get(f"{base_url}{search_term}")
if response.status_code != 200:
    print("Can't request website")
else:
    results = []
    # "html.parser" selects which parser BeautifulSoup uses to read the HTML
    # text (here: the parser that ships with Python).
    soup = BeautifulSoup(response.text, "html.parser")
    # Find every <section> that has the class "jobs".
    # class_="jobs" is a keyword argument (see the "*Keyword argument" note).
    jobs = soup.find_all("section", class_="jobs")
    for job_section in jobs:
        # The last <li> of a section is the "view all" entry, not a job post.
        # Slicing drops it and, unlike pop(-1), does not raise when the
        # section contains no <li> at all.
        job_posts = job_section.find_all("li")[:-1]
        for post in job_posts:
            anchors = post.find_all("a")
            # The second anchor of a post is the one read for the job link
            # and the company/title spans.
            anchor = anchors[1]
            link = anchor["href"]
            # Unpacking shortcut (see the "**Shortcut" note): this requires
            # exactly three <span class="company"> elements in the anchor.
            company, _kind, region = anchor.find_all("span", class_="company")
            title = anchor.find("span", class_="title")
            job_data = {
                "link": f"https://weworkremotely.com{link}",
                "company": company.string,
                "region": region.string,
                "position": title.string,
            }
            results.append(job_data)
    for result in results:
        print(result)
        print("/////////////////////")
*Keyword argument
def say_hello(name: str, age: int) -> None:
    """Print a greeting that mentions *name* and *age*."""
    print(f"Hello {name} you are {age} years old")


# Positional arguments: each value is matched to a parameter by its position.
say_hello("nico", 12)
# Keyword arguments: each value is matched to a parameter by name, so the
# order in which the arguments are written does not matter.
say_hello(name="nico", age=12)
**Shortcut
list_of_numbers = [1, 2, 3]

# Long form: one index lookup per variable.
first = list_of_numbers[0]
second = list_of_numbers[1]
third = list_of_numbers[2]

# Shortcut: unpack the whole list into the three names in one statement.
first, second, third = list_of_numbers
# Unpacking only works if you know the length of the list in advance: the
# number of names on the left must equal the number of items, otherwise
# Python raises ValueError.
CSS Selector
자손 태그 (descendant): `A B` — A 안에 있는 모든 B (깊이와 무관)
자식 태그 (child): `A > B` — A 바로 아래 단계에 있는 B만
find vs select
select도 find처럼 태그 이름, 속성, 속성값으로 태그를 특정한다는 점은 같다. 하지만 CSS는 이 외에도 다양한 선택자(selector)를 갖기 때문에 여러 요소를 조합하여 태그를 특정하기 쉽다. 예를 들어 특정 경로의 태그를 객체로 반환하고 싶을 때, find의 경우 경로의 단계마다 find를 반복해서 호출해야 한다. select는 선택자 하나로 하위 경로를 직접 지정할 수 있기 때문에 간편하다.
# find: one call per level; the second find searches inside the <div>
# returned by the first.
# NOTE(review): find looks through all descendants by default, so the <p>
# found here need not be a direct child of the <div> — confirm against the
# BeautifulSoup docs if strict equivalence with the selector below matters.
soup.find('div').find('p')
# select: a single CSS selector describes the whole path; "div > p" asks for
# a <p> that is a direct child of a <div>, and select_one returns the first
# match.
soup.select_one('div > p')