scraper_script.py
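"""Continuously scrape product listings from https://scrapeme.live/shop/,
publish each product to the Kafka topic 'products_topic', and append the
same record to scraped_products.json."""
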
import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer

# Kafka producer setup
producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# File to store scraped data
json_file = 'scraped_products.json'

# Ensure the JSON file exists
if not os.path.exists(json_file):
    with open(json_file, 'w') as f:
        json.dump([], f)

# Append product data to the JSON file
def append_to_json_file(data):
    with open(json_file, 'r+') as f:
        file_data = json.load(f)
        file_data.append(data)
        f.seek(0)
        json.dump(file_data, f, indent=4)
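
# Note: this reloads and rewrites the full JSON array on every call, which is
# fine for modest volumes but gets slower as scraped_products.json grows.
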
# Scrape product details
def scrape_product_details(url):
    response = requests.get(url)
    if response.status_code != 200:
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    stock = soup.find('p', class_='stock in-stock')
    return {
        'name': soup.find('h1', class_='product_title').text.strip(),
        'price': soup.find('p', class_='price').text.strip(),
        'description': soup.find('div', class_='woocommerce-product-details__short-description').text.strip(),
        'stock_number': stock.text.strip() if stock else "Out of Stock",
        'url': url
    }

# Scrape main page and send data to Kafka + save to JSON file
def scrape_main_page():
    page = 1
    while True:  # Infinite loop to keep scraping continuously
        response = requests.get(f'https://scrapeme.live/shop/page/{page}/')
        if response.status_code != 200:
            print(f"Failed to retrieve page {page}: {response.status_code}")
            time.sleep(10)  # Wait before retrying
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        products = soup.find_all('li', class_='product')
        # If no products are found on the page, loop back to the first page
        if not products:
            print(f"No products found on page {page}. Returning to page 1.")
            page = 1  # Reset to the first page
            time.sleep(10)  # Short sleep before restarting
            continue
        # Process all products on the current page
        for product in products:
            details = scrape_product_details(product.find('a')['href'])
            if details:
                details['timestamp'] = datetime.now().isoformat()
                # Send to Kafka
                producer.send('products_topic', details)
                # Save to JSON file
                append_to_json_file(details)
                print(f"Sent and saved product: {details['name']}")
            time.sleep(1)  # Delay to avoid overwhelming the server
        page += 1  # Move to the next page

# Run the scraper continuously
try:
    while True:
        scrape_main_page()
        time.sleep(10)  # Optional: Add a delay before restarting the scraping cycle
finally:
    producer.flush()
    producer.close()
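
# Stopping the script (e.g. with Ctrl+C) exits the loop and runs the `finally`
# block above, which flushes any buffered messages and closes the Kafka producer.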