Skip to content

scraper.html: Rewrite srcset parsing #493

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions wpull/scraper/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,11 +636,31 @@ def iter_links_by_js_attrib(self, attrib_name, attrib_value):

@classmethod
def iter_links_by_srcset_attrib(cls, attrib_name, attrib_value):
    '''Iterate the image candidate URLs of a ``srcset`` attribute.

    This follows the tokenization part of the WHATWG "parse a srcset
    attribute" algorithm. Descriptors (``2x``, ``100w``, ...) are skipped
    without validation and parse errors are ignored, so a candidate with a
    malformed descriptor is still yielded.

    Args:
        attrib_name (str): Name of the attribute; yielded back unchanged.
        attrib_value (str): Raw attribute value. ``None`` or an empty
            string yields nothing.

    Yields:
        tuple: ``(attrib_name, url)`` for each candidate, in document order.
    '''
    # The spec splits on ASCII whitespace only (TAB, LF, FF, CR, SPACE).
    # Regex ``\s`` on str would also match e.g. U+00A0, which is URL data.
    ascii_whitespace = '\t\n\x0c\r '
    separators = ascii_whitespace + ','

    text = attrib_value or ''
    length = len(text)
    position = 0

    while True:
        # Splitting loop: skip whitespace and stray commas between
        # candidates.
        while position < length and text[position] in separators:
            position += 1

        if position >= length:
            return

        # The URL is the longest run of non-whitespace characters. Commas
        # inside it are kept; only trailing ones act as separators.
        start = position
        while position < length and text[position] not in ascii_whitespace:
            position += 1
        url = text[start:position]

        if url.endswith(','):
            # The candidate has no descriptors; the comma(s) ended it.
            url = url.rstrip(',')
        else:
            # Skip the descriptors up to and including the next comma
            # that is not enclosed in parentheses, or to end of input.
            in_parens = False
            while position < length:
                char = text[position]
                position += 1
                if in_parens:
                    if char == ')':
                        in_parens = False
                elif char == '(':
                    in_parens = True
                elif char == ',':
                    break

        yield attrib_name, url

@classmethod
def is_link_inline(cls, tag, attribute):
Expand Down
64 changes: 64 additions & 0 deletions wpull/scraper/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,3 +529,67 @@ def get_html_parser(self):
class TestHTML5LibHTMLScraper(Mixin, unittest.TestCase):
    '''Test case combining ``Mixin`` with the html5lib-based parser.'''

    def get_html_parser(self):
        '''Return a new ``HTML5LibHTMLParser`` instance.'''
        return HTML5LibHTMLParser()


class TestElementWalker(unittest.TestCase):
    '''Tests for the ``srcset`` link extraction of ``ElementWalker``.'''

    # Pairs of (srcset attribute value, URLs expected in document order).
    _srcset_tests = (
        (
            'https://example.com/one 1x, https://example.com/two 2x',
            (
                'https://example.com/one',
                'https://example.com/two',
            ),
        ),
        (
            'https://example.com/one 100w, https://example.com/two 200w',
            (
                'https://example.com/one',
                'https://example.com/two',
            ),
        ),
        (
            # Commas inside a URL are data, not separators.
            (
                'https://example.com/w=100,h=100/why.jpg 100w,'
                'https://example.com/w=200,h=200/why.jpg 200w'
            ),
            (
                'https://example.com/w=100,h=100/why.jpg',
                'https://example.com/w=200,h=200/why.jpg',
            ),
        ),
        (
            # Any ASCII whitespace separates the URL from its descriptors,
            # and invalid descriptors do not drop the candidate.
            (
                'https://example.com/IAmAMassiveAsshole\t\t2e2x,\n'
                'https://example.com/AndIHateYou\r\n-10x, '
            ),
            (
                'https://example.com/IAmAMassiveAsshole',
                'https://example.com/AndIHateYou',
            ),
        ),
        (
            # Leading, repeated and trailing commas are ignored when the
            # candidates are separated by whitespace.
            (
                ',,, https://example.com/it_is_wild_how_this_is_valid,,, '
                'https://example.com/lmfao,,,,,'
            ),
            (
                'https://example.com/it_is_wild_how_this_is_valid',
                'https://example.com/lmfao',
            ),
        ),
        (
            # Without whitespace the whole run is a single URL token; only
            # its trailing commas are stripped.
            (
                ',,,https://example.com/it_is_wild_how_this_is_valid,,,'
                'https://example.com/lmfao,,,,,'
            ),
            (
                'https://example.com/it_is_wild_how_this_is_valid,,,'
                'https://example.com/lmfao',
            ),
        ),
        (
            ',,,,,',
            (),
        ),
        (
            '',
            (),
        ),
    )

    def test_iter_links_by_srcset_attrib(self):
        '''Each srcset value yields exactly its expected URLs, in order.'''
        # Imported locally so this test does not depend on which names the
        # module imports at the top of the file.
        from wpull.scraper.html import ElementWalker

        for srcset, expected in self._srcset_tests:
            # subTest reports every failing value instead of stopping at
            # the first one.
            with self.subTest(srcset=srcset):
                actual = tuple(
                    value for _, value in
                    ElementWalker.iter_links_by_srcset_attrib(
                        'srcset', srcset
                    )
                )
                self.assertEqual(actual, expected)