scrape_step_3_insta.py
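"""Collect Instagram links from a list of websites.

Reads website URLs from columns D-E of fxout.xlsx (presumably produced by an
earlier step of this scraping pipeline), loads each site in headless Chromium
via Playwright, scans the rendered HTML for anchors pointing at
instagram.com, and appends every match to fxinsta2.xlsx.
"""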
import os
import asyncio

import openpyxl
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright


async def main(
    output_file_path=r"fxout.xlsx",
    insta_file_path=r"fxinsta2.xlsx",
):
    # Load the workbook and worksheet for reading URLs.
    workbook_read = openpyxl.load_workbook(output_file_path, read_only=True)
    worksheet_read = workbook_read.active

    # Extract website URLs from columns D and E of the Excel file.
    website_urls = [
        cell
        for row in worksheet_read.iter_rows(
            min_row=1, min_col=4, max_col=5, values_only=True
        )
        for cell in row
        if cell and isinstance(cell, str) and cell.startswith("http")
    ]
    workbook_read.close()

    # Open the output workbook, creating it if it does not exist yet.
    if not os.path.exists(insta_file_path):
        workbook_write = openpyxl.Workbook()
        worksheet_write = workbook_write.active
        worksheet_write.title = "Instagram Links"
    else:
        workbook_write = openpyxl.load_workbook(insta_file_path)
        worksheet_write = workbook_write.active

    async def fetch_instagram_links_with_playwright(url, semaphore, browser):
        # The semaphore caps how many pages are open concurrently.
        async with semaphore:
            page = await browser.new_page()
            try:
                await page.goto(url, timeout=30000)  # 30-second timeout
                content = await page.content()
                soup = BeautifulSoup(content, "html.parser")
                instagram_links = [
                    a["href"]
                    for a in soup.find_all("a", href=True)
                    if "instagram.com" in a["href"]
                ]
                if instagram_links:
                    print(f"Instagram links found on {url}:")
                    for link in instagram_links:
                        print(link)
                        worksheet_write.append([link])
            except Exception as e:
                print(f"Error fetching data from {url}: {e}")
            finally:
                await page.close()

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # Control the number of concurrent tasks.
        semaphore = asyncio.Semaphore(5)
        tasks = [
            fetch_instagram_links_with_playwright(url, semaphore, browser)
            for url in website_urls
        ]
        await asyncio.gather(*tasks)
        await browser.close()

    # Save the workbook after all tasks are complete.
    try:
        workbook_write.save(insta_file_path)
        print(f"Data saved successfully to {insta_file_path}.")
    except Exception as e:
        print(f"Error saving workbook: {e}")
    finally:
        workbook_write.close()


# Run the main function.
if __name__ == "__main__":
    asyncio.run(main())
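A minimal sketch of reusing the entry point with non-default workbook paths; the file names below are illustrative placeholders, not from the original script:

    import asyncio
    from scrape_step_3_insta import main

    # Hypothetical paths; any input workbook with URLs in columns D-E works.
    asyncio.run(main(output_file_path="urls.xlsx", insta_file_path="insta_links.xlsx"))

Because the scraping runs behind asyncio.Semaphore(5), at most five Playwright pages are open at once regardless of how many URLs the input workbook contains; raising that value trades memory and target-site load for throughput.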