-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_step_5_compile.py
61 lines (51 loc) · 1.86 KB
/
scrape_step_5_compile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import asyncio
import pandas as pd
import scrape_step_1_links
import scrape_step_2_extract
import scrape_step_3_insta
import scrape_step_4_check
# Substrings that mark a URL as unwanted when they appear anywhere in it.
_UNWANTED_URL_PATTERNS = (
    "instagram.com/p/",
    "instagram.com/v/",
    "instagram.com/reel/",
    "instagram.com/reels/",
)


def _is_unwanted_url(url) -> bool:
    """Return True when *url* contains one of the unwanted URL patterns.

    Missing values (NaN/None) are never flagged, so rows without a URL
    are kept by the caller's filter.
    """
    if pd.isna(url):
        return False
    url_str = str(url)
    return any(pattern in url_str for pattern in _UNWANTED_URL_PATTERNS)


def _drop_duplicate_rows(file_path) -> None:
    """Rewrite the Excel file at *file_path* with duplicate rows removed."""
    pd.read_excel(file_path).drop_duplicates().to_excel(file_path, index=False)


def _clean_insta_file(file_path) -> None:
    """De-duplicate the Excel file and drop rows whose first column is unwanted.

    The file is read once and written once. If the sheet has no columns,
    only the de-duplicated (empty) frame is written back and a message is
    printed.
    """
    df = pd.read_excel(file_path).drop_duplicates()
    if df.shape[1] > 0:
        # Coerce to bool: on an empty column ``apply`` returns an empty
        # Series of the column's own dtype, which is not a valid row mask.
        unwanted = df.iloc[:, 0].apply(_is_unwanted_url).astype(bool)
        df = df.loc[~unwanted]
    else:
        print("DataFrame does not contain any columns.")
    df.to_excel(file_path, index=False)


async def main(
    place_name: str = "Varanasi",
    search_term: str = "Aesthetics Clinic",
    input_file_path: str = r"fx.xlsx",
    output_file_path: str = r"fxout.xlsx",
    insta_file_path: str = r"fxinsta.xlsx",
) -> None:
    """Run scrape steps 1-3 in order, cleaning the Excel files between steps.

    Sequence:
      1. ``scrape_step_1_links.main`` (link extraction), then duplicate rows
         are removed from ``input_file_path``.
      2. ``scrape_step_2_extract.main`` (information extraction).
      3. ``scrape_step_3_insta.main`` (Instagram link extraction), then
         ``insta_file_path`` is de-duplicated and rows whose first column
         matches an unwanted URL pattern are removed.

    Args:
        place_name: Forwarded to step 1 as its third argument.
        search_term: Forwarded to step 1 as its second argument.
        input_file_path: Excel file passed to step 1 and read by step 2;
            rewritten in place without duplicate rows after step 1.
        output_file_path: Excel file passed to step 2 and then to step 3.
        insta_file_path: Excel file passed to step 3; rewritten in place
            after de-duplication and URL filtering.

    NOTE(review): the pandas reads/writes are synchronous and block the
    event loop while they run; steps are awaited strictly one after another.
    """
    # Step 1: Extract links
    await scrape_step_1_links.main(input_file_path, search_term, place_name)
    _drop_duplicate_rows(input_file_path)

    # Step 2: Extract information
    await scrape_step_2_extract.main(input_file_path, output_file_path)

    # Step 3: Extract Instagram links
    await scrape_step_3_insta.main(output_file_path, insta_file_path)

    # Single read/write pass: remove duplicates and unwanted URLs together.
    _clean_insta_file(insta_file_path)
# Script entry point: run the main() coroutine with its default arguments.
if __name__ == "__main__":
    asyncio.run(main())