ArchiveTeam · BlankEclair · Apr 10, 2025
diff --git a/wpull/scraper/html.py b/wpull/scraper/html.py
@@ -636,11 +636,54 @@ def iter_links_by_js_attrib(self, attrib_name, attrib_value):
 
     @classmethod
     def iter_links_by_srcset_attrib(cls, attrib_name, attrib_value):
-        images = attrib_value.split(',')
-        links = [value.lstrip().split(' ', 1)[0] for value in images]
+        """Iterate the image candidate URLs of a ``srcset`` attribute.
+
+        Implements the splitting part of the WHATWG "parse a srcset
+        attribute" algorithm. Descriptors are skipped, not validated, so
+        candidates the specification would drop as parse errors are
+        still yielded.
+
+        Args:
+            attrib_name (str): Attribute name, yielded back unchanged.
+            attrib_value (str): Raw value of the attribute.
+
+        Yields:
+            tuple: ``(attrib_name, url)`` for each image candidate.
+        """
+        # https://web.archive.org/web/20250409200537/https://html.spec.whatwg.org/multipage/images.html#parsing-a-srcset-attribute
+        # ASCII whitespace is spelled out in the patterns below: ``\s`` on a
+        # str pattern also matches Unicode whitespace such as U+00A0, which
+        # the specification treats as part of the URL.
+        remaining = attrib_value
 
-        for link in links:
-            yield attrib_name, link
+        while True:
+            # 4. Splitting loop: skip leading ASCII whitespace and commas.
+            remaining = re.sub(r'^[ \t\n\f\r,]+', '', remaining)
+
+            # 5. If position is past the end of input, stop.
+            if not remaining:
+                break
+
+            # 6. Collect a sequence of code points that are not ASCII
+            # whitespace from input given position, and let that be url
+            url, remaining = re.match(
+                r'([^ \t\n\f\r]+)([\s\S]*)', remaining
+            ).groups()
+
+            if url.endswith(','):
+                # 8. A trailing comma ends the candidate; it has no
+                # descriptors.
+                url = url.rstrip(',')
+            else:
+                # 8. Descriptor tokenizer: descriptors run up to the next
+                # comma or the end of input, except that a comma inside
+                # parentheses does not end the candidate.
+                descriptors = re.match(
+                    r'(?:[^,(]|\([^)]*\)?)*,?', remaining
+                )
+                remaining = remaining[descriptors.end():]
+
+            yield attrib_name, url
 
     @classmethod
     def is_link_inline(cls, tag, attribute):

diff --git a/wpull/scraper/html_test.py b/wpull/scraper/html_test.py
@@ -529,3 +529,84 @@ def get_html_parser(self):
 class TestHTML5LibHTMLScraper(Mixin, unittest.TestCase):
     def get_html_parser(self):
         return HTML5LibHTMLParser()
+
+
+class TestElementWalker(unittest.TestCase):
+    """Unit tests for the ``ElementWalker`` srcset link extraction."""
+
+    # Pairs of (srcset attribute value, URLs expected from it, in order).
+    _srcset_tests = (
+        (
+            'https://example.com/one 1x, https://example.com/two 2x',
+            (
+                'https://example.com/one',
+                'https://example.com/two',
+            ),
+        ),
+        (
+            'https://example.com/one 100w, https://example.com/two 200w',
+            (
+                'https://example.com/one',
+                'https://example.com/two',
+            ),
+        ),
+        (
+            (
+                'https://example.com/w=100,h=100/why.jpg 100w,'
+                'https://example.com/w=200,h=200/why.jpg 200w'
+            ),
+            (
+                'https://example.com/w=100,h=100/why.jpg',
+                'https://example.com/w=200,h=200/why.jpg',
+            ),
+        ),
+        (
+            (
+                'https://example.com/IAmAMassiveAsshole\t\t2e2x,\n'
+                'https://example.com/AndIHateYou\r\n-10x, '
+            ),
+            (
+                'https://example.com/IAmAMassiveAsshole',
+                'https://example.com/AndIHateYou',
+            ),
+        ),
+        (
+            # A URL runs until the next ASCII whitespace, so the space
+            # after the first run of commas is what separates the two
+            # candidates; commas alone would leave a single URL.
+            (
+                ',,,https://example.com/it_is_wild_how_this_is_valid,,, '
+                'https://example.com/lmfao,,,,,'
+            ),
+            (
+                'https://example.com/it_is_wild_how_this_is_valid',
+                'https://example.com/lmfao',
+            ),
+        ),
+        (
+            ',,,,,',
+            (),
+        ),
+        (
+            '',
+            (),
+        ),
+    )
+
+    def test_iter_links_by_srcset_attrib(self):
+        """Check the URLs extracted from every sample in _srcset_tests."""
+        # Imported locally so this test does not rely on the module's
+        # import list. NOTE(review): assumes the classmethod lives on
+        # ElementWalker in wpull/scraper/html.py - confirm.
+        from wpull.scraper.html import ElementWalker
+
+        for srcset, expected in self._srcset_tests:
+            # subTest reports the failing sample instead of only the first.
+            with self.subTest(srcset=srcset):
+                actual = tuple(
+                    value for _, value in
+                    ElementWalker.iter_links_by_srcset_attrib(
+                        'srcset', srcset
+                    )
+                )
+                self.assertEqual(actual, expected)