I am trying to scroll a webpage with Selenium: https://jobsearch.az/vacancies. When you open it and click on a vacancy, there are two panels side by side, each with its own scrollbar. I need to scroll the one in the middle (the vacancy list) so Selenium can go on and take info from the other jobs too. Right now it stops after 14 vacancies, which is all you can see if you don't scroll.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains

path = "C:/Users/nihad/OneDrive/Documents/aInternship/chromedriver.exe"
driver = webdriver.Chrome(path)
url = "https://jobsearch.az/vacancies"
driver.get(url)
time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'html.parser')
lists = soup.find_all('div', {'class': 'list__item'})
jobs_list = []

print('A')
x = True
while x:
    print('B')
    driver.maximize_window()
    # Visit every vacancy currently present in the list.
    for index, _list in enumerate(lists):
        link = _list.find('a', class_="list__item__text")["href"]
        current_url = "https://jobsearch.az" + link
        driver.get(current_url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        jobs = soup.find_all('div', {'class': 'vacancy'})
        for index, job in enumerate(jobs):
            company = job.find('div', class_="vacancy__start").text.strip()
            print(f'Company: {company}')
            category = job.find('span', class_="company__industry").text
            print(f'Category: {category}')
            # key_requirements = job.find('div', class_="content-text").text
            # print(f'Key requirements: {key_requirements}')
            job_title = job.find('h1', class_="vacancy__title").text.strip()
            print(f'Job title: {job_title}')
            deadline = job.find('span', class_="vacancy__deadline").text.strip()
            print(f'Deadline: {deadline}')
            views = _list.find('div', class_="list__item__end").text.strip()
            print(f'Views: {views}')
            data = {
                "job_title": job_title,
                "company": company,
                "category": category,
                "deadline": deadline,
                "views": views
            }
            jobs_list.append(data)
    driver.minimize_window()
    print('C')
    # Technique 1: click a list item, then scroll the window to the bottom.
    driver.find_element_by_xpath('//h3[@class="list__item__title"]').click()
    driver.execute_script("window.scrollBy(0, document.body.scrollHeight)")
    # Technique 2: send END keystrokes to an element in the list.
    for i in range(5):
        driver.find_element_by_tag_name('h3').send_keys(Keys.END)
        time.sleep(4)
    # Technique 3: scroll the window down by a fixed amount.
    driver.execute_script("window.scrollBy(0, 1000)", "")
    time.sleep(5)
    print('after executing scrolling')
    # element = driver.find_element_by_class_name('vacancy__title')
    # actions = ActionChains(driver)
    # actions.move_to_element(element).perform()

dataframe = pd.DataFrame(jobs_list)
dataframe
driver.close()
I tried all three scrolling techniques (marked in the code above), but none of them works.
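For reference, scrolling an inner pane (rather than the window) is normally done by running JavaScript against the scrollable element itself, reusing the driver from the script above. A minimal sketch, assuming the middle pane can be located with a CSS selector such as div.list; that selector is a guess and has to be verified against the page's actual DOM:

import time
from selenium.webdriver.common.by import By

# "div.list" is an assumed selector; inspect the page to find the element
# that actually owns the middle pane's scrollbar.
pane = driver.find_element(By.CSS_SELECTOR, "div.list")
# Scroll the pane itself rather than the window, so new items lazy-load.
driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", pane)
time.sleep(2)  # give the site a moment to append more list items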
Answer
I don't know if you need to use Selenium at all. Here is an example with another library, requests: the site exposes a JSON API, and since that API uses cookies, we have to go through a session so the XSRF token is passed along.
import requests
import pandas as pd

jobs_list = []

def foo(url):
    headers = {
        'accept': 'application/json, text/plain, */*',
        'x-requested-with': 'XMLHttpRequest'
    }
    response = session.request("GET", url, headers=headers)
    # Collect the fields we need from every vacancy on this page.
    for job in response.json()['items']:
        data = {
            "job_title": job['title'],
            "company": job['company']['title'],
            "category": job['category']['title'],
            "deadline": job['deadline_at'],
            "views": job['view_count']
        }
        jobs_list.append(data)
    # The API returns a 'next' link while there are more pages; recurse into it.
    if 'next' in response.json():
        foo(response.json()['next'])

# Hit the normal page first so the session picks up the cookies
# (including the XSRF token) that the API expects.
session = requests.session()
response = session.get('https://jobsearch.az/vacancies')

foo('https://jobsearch.az/api-az/vacancies-az?hl=az')

dataframe = pd.DataFrame(jobs_list)
dataframe
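The recursion above simply follows the API's 'next' link page by page. If you would rather walk the pages iteratively, the same traversal can be written as a loop; this sketch reuses the session and jobs_list from above and assumes the same JSON shape:

def fetch_all(url):
    headers = {
        'accept': 'application/json, text/plain, */*',
        'x-requested-with': 'XMLHttpRequest'
    }
    # Follow the 'next' links until the API stops returning one.
    while url:
        payload = session.get(url, headers=headers).json()
        for job in payload['items']:
            jobs_list.append({
                "job_title": job['title'],
                "company": job['company']['title'],
                "category": job['category']['title'],
                "deadline": job['deadline_at'],
                "views": job['view_count'],
            })
        url = payload.get('next')  # absent on the last page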
OUTPUT:
                                              job_title  ... views
0                                       Revenue manager  ...   356
1                           Operator (Satış təmsilçisi)  ...   236
2                                        Satıcı (xanım)  ...   766
3     ADM ISO 9001 Beynəlxalq Sertifikat Proqramını ...  ...  1.6K
4                     Avto-sənayedə operator montajçısı  ...   218
...                                                 ...  ...   ...
1656                        Receptionist (gecə növbəsi)  ...   735
1657                                           Складчик  ...   400
1658                                  Android proqramçı  ...   358
1659                             Credit Risk Specialist  ...   587
1660  İpoteka üzrə mütəxəssis, aparıcı mütəxəssis (B...  ...   439

[1661 rows x 5 columns]
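If you want to keep the scraped data around, pandas can write the frame straight to disk; the filename here is just an example:

dataframe.to_csv("jobsearch_vacancies.csv", index=False)  # path/name is arbitrary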