From b79c16fdd49fad5b09a0aa810e55cea99bc094e3 Mon Sep 17 00:00:00 2001 From: Amanda-P-bus Date: Thu, 10 Apr 2025 17:32:32 -0400 Subject: [PATCH] Update spider.py Moved ' soup = BeautifulSoup(html, "html.parser") ' from line 92 to line 80. I was getting parsing errors every time I attempted to run it. Moving it above the if statements inside the try block finally got it running smoothly. I saw many people online with the same issue/error when attempting to run this code so they could follow along with the Page Rank worked example. Hope this isn't too wordy; it's my very first pull request. --- code3/pagerank/spider.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/code3/pagerank/spider.py b/code3/pagerank/spider.py index 822e9d3c9..9eb4200fa 100644 --- a/code3/pagerank/spider.py +++ b/code3/pagerank/spider.py @@ -77,6 +77,8 @@ document = urlopen(url, context=ctx) html = document.read() + soup = BeautifulSoup(html, "html.parser") + if document.getcode() != 200 : print("Error on page: ",document.getcode()) cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) ) @@ -86,10 +88,9 @@ cur.execute('DELETE FROM Pages WHERE url=?', ( url, ) ) conn.commit() continue - + print('('+str(len(html))+')', end=' ') - soup = BeautifulSoup(html, "html.parser") except KeyboardInterrupt: print('') print('Program interrupted by user...')