using Beautiful Soup 4, Requests, and Python

GitHub

GitHub - haeeema/job_scrapper

Code

from requests import get
from bs4 import BeautifulSoup

# Search endpoint of We Work Remotely; the search term is appended to it.
base_url = "https://weworkremotely.com/remote-jobs/search?term="
search_term = "python"

response = get(f"{base_url}{search_term}")
if response.status_code != 200:
    print("Can't request website")
else:
    results = []
    # "html.parser" selects Python's built-in HTML parser for the page text.
    soup = BeautifulSoup(response.text, "html.parser")
    # Find every <section> that has the class "jobs". `class_` is passed as a
    # keyword argument; the trailing underscore is needed because `class` is
    # a reserved word in Python.
    jobs = soup.find_all("section", class_="jobs")
    for job_section in jobs:
        # The last <li> of a section is the "view all" entry, not a posting,
        # so it is dropped. Slicing (unlike pop) is also safe when the
        # section contains no <li> at all.
        job_posts = job_section.find_all("li")[:-1]
        for post in job_posts:
            anchors = post.find_all("a")
            # The posting details are read from the second anchor of the <li>.
            anchor = anchors[1]
            link = anchor["href"]
            # Unpacking shortcut: this requires exactly three spans with the
            # class "company" inside the anchor.
            company, kind, region = anchor.find_all("span", class_="company")
            title = anchor.find("span", class_="title")
            job_data = {
                # `link` is appended to the site root to form the full URL.
                "link": f"https://weworkremotely.com{link}",
                "company": company.string,
                "region": region.string,
                "position": title.string,
            }
            results.append(job_data)
    for result in results:
        print(result)
        print("/////////////////////")

*Keyword argument

def say_hello(name, age):
    """Print a greeting that mentions ``name`` and ``age``.

    Both parameters may be supplied positionally or by keyword.
    """
    greeting = f"Hello {name} you are {age} years old"
    print(greeting)

# Positional call: the arguments are matched to the parameters by position.
say_hello("nico", 12)
# Keyword call: each argument names the parameter it belongs to, so the
# order in which the arguments are written does not matter.
say_hello(name="nico", age=12)

**Shortcut

list_of_numbers = [1, 2, 3]

# Long form: read each element by its index.
first = list_of_numbers[0]
second = list_of_numbers[1]
third = list_of_numbers[2]

# Shortcut: unpack the whole list into three names in a single statement.
first, second, third = list_of_numbers

# Unpacking only works when the number of names matches the length of the
# list, so the length has to be known in advance (a mismatch raises
# ValueError).

CSS Selector

자손 태그: `A B` (공백) — A 태그 안에 있는 모든 B 태그를 선택한다. 깊이는 상관없다.

자식 태그: `A > B` — A 태그 바로 아래(직계)에 있는 B 태그만 선택한다.

find vs select

select도 find처럼 태그 이름, 속성, 속성값으로 태그를 특정한다는 점은 같다. 하지만 select는 CSS 선택자(selector)를 사용하므로, 이 외에도 다양한 선택자를 조합하여 태그를 특정하기 쉽다. 예를 들어 특정 경로에 있는 태그를 객체로 반환하고 싶을 때, find는 경로의 단계마다 반복해서 호출해야 한다. 반면 select는 선택자 하나로 하위 경로를 직접 지정할 수 있기 때문에 간편하다.

# find: chain one call per level to walk down the tree.
# NOTE(review): if no <div> exists, the first find() presumably returns None
# and the chained call raises AttributeError — confirm against the bs4 docs.
soup.find('div').find('p')

# select_one: a single CSS selector expresses the whole path; "div > p"
# uses the child combinator, i.e. a <p> directly under a <div>.
soup.select_one('div > p')