feat(scraper): add ScraperError + _dump_html + _find_input_value helpers

This commit is contained in:
yiekheng 2026-05-02 17:54:21 +08:00
parent 9ec0d2ade4
commit b7bc534681
2 changed files with 163 additions and 2 deletions

View File

@ -3,8 +3,15 @@ import requests, re
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import os import os
# with open('security_response.html', 'wb') as f:
# f.write(response.content)
class ScraperError(Exception):
    """Raised when a cm99.net response lacks a field the scraper needs.

    By the time this exception propagates, the raw response has already
    been written under logs/scraper-failures/. The message names the
    method that failed and the field it was looking for.
    """
class CM_BOT: class CM_BOT:
def __init__(self): def __init__(self):
@ -202,6 +209,41 @@ class CM_BOT:
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36' 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36'
} }
def _dump_html(self, context: str, content) -> str:
"""Save a failing cm99.net response to logs/scraper-failures/.
Returns the path written to so callers can include it in error
messages.
"""
ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
out_dir = os.path.join("logs", "scraper-failures")
os.makedirs(out_dir, exist_ok=True)
path = os.path.join(out_dir, f"{context}-{ts}.html")
if isinstance(content, (bytes, bytearray)):
data = bytes(content)
else:
data = str(content).encode("utf-8", "replace")
with open(path, "wb") as f:
f.write(data)
print(f"[scraper-failure] dumped {context} response to {path}")
return path
def _find_input_value(self, soup, ident: str, *, context: str, raw, by: str = "name") -> str:
"""Extract <input {by}=IDENT value=...>'s value or raise ScraperError.
`by` selects between matching <input name=...> (default) and
<input id=...>. Saves the raw response to logs/scraper-failures/
before raising so the operator can postmortem.
"""
el = soup.find("input", {by: ident})
if el is None or "value" not in el.attrs:
path = self._dump_html(context, raw)
raise ScraperError(
f"{context}: input[{by}={ident!r}] missing or has no value attribute "
f"(response saved to {path})"
)
return el["value"]
def get_register_data(self, token: str, username: str, password: str): def get_register_data(self, token: str, username: str, password: str):
return { return {
'struts.token.name': 'token', 'struts.token.name': 'token',

View File

@ -0,0 +1,119 @@
"""Tests for the cm_bot scraper resilience helpers.
The CM_BOT class currently uses bare `soup.find(...)['value']` calls
that throw cryptic TypeErrors when cm99.net returns an unexpected
response. R3 introduces three pieces:
- ScraperError: typed exception so callers can distinguish scraper
failures from network errors.
- _dump_html(context, content): writes the failing response to
logs/scraper-failures/<context>-<ts>.html and returns the path.
- _find_input_value(soup, ident, *, context, raw, by='name'):
the dominant extraction pattern. Returns the value on success,
dumps + raises ScraperError on miss.
These tests do NOT exercise the live cm99.net integration. They use
small inline HTML fixtures and run inside a per-test temporary
directory, so filesystem side effects stay contained and the tests
stay hermetic.
"""
import os
import shutil
import tempfile
import unittest
from unittest import mock
from bs4 import BeautifulSoup
from app.cm_bot import CM_BOT, ScraperError
class ScraperHelpersTests(unittest.TestCase):
    """Hermetic tests for CM_BOT._dump_html and CM_BOT._find_input_value.

    Every test runs with CM_BOT_BASE_URL patched and with the working
    directory switched to a fresh temporary directory, because
    _dump_html writes to the relative path logs/scraper-failures.
    """

    def setUp(self):
        # CM_BOT.__init__ reads CM_BOT_BASE_URL from the env. Patch it
        # in setUp (mock.patch.dict as a class decorator only wraps
        # test_* methods, so setUp would see an unpatched env).
        self._env_patcher = mock.patch.dict(
            os.environ, {"CM_BOT_BASE_URL": "https://example.invalid"}
        )
        self._env_patcher.start()
        self.addCleanup(self._env_patcher.stop)

        # Each test gets a fresh tmpdir so the dump helper writes
        # somewhere predictable. Restoration is registered with
        # addCleanup rather than tearDown: cleanups still run when a
        # later setUp step (e.g. CM_BOT()) raises, whereas tearDown
        # would be skipped and leave the process inside a leaked tmpdir.
        # Cleanups run LIFO: chdir back first, then remove the tmpdir.
        self._old_cwd = os.getcwd()
        self._tmp = tempfile.mkdtemp(prefix="r3-test-")
        self.addCleanup(shutil.rmtree, self._tmp, ignore_errors=True)
        os.chdir(self._tmp)
        self.addCleanup(os.chdir, self._old_cwd)

        self.bot = CM_BOT()

    # ---- _dump_html ----
    def test_dump_html_creates_dir_and_writes_bytes(self):
        path = self.bot._dump_html("ctx-test", b"<html>hi</html>")
        self.assertTrue(os.path.isfile(path), f"file should exist: {path}")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")
        self.assertTrue(path.startswith(os.path.join("logs", "scraper-failures")))

    def test_dump_html_accepts_str_content(self):
        path = self.bot._dump_html("ctx-test", "<html>hi</html>")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")

    def test_dump_html_includes_context_and_timestamp_in_filename(self):
        path = self.bot._dump_html("register_form_token", b"x")
        basename = os.path.basename(path)
        self.assertTrue(basename.startswith("register_form_token-"), basename)
        self.assertTrue(basename.endswith(".html"), basename)

    # ---- _find_input_value ----
    def test_find_input_value_returns_value_when_present(self):
        html = '<form><input name="token" value="abc123"></form>'
        soup = BeautifulSoup(html, "html.parser")
        result = self.bot._find_input_value(
            soup, "token", context="happy_path", raw=html.encode()
        )
        self.assertEqual(result, "abc123")

    def test_find_input_value_supports_lookup_by_id(self):
        html = '<form><input id="tok" value="abc123"></form>'
        soup = BeautifulSoup(html, "html.parser")
        result = self.bot._find_input_value(
            soup, "tok", context="by_id", raw=html.encode(), by="id"
        )
        self.assertEqual(result, "abc123")

    def test_find_input_value_raises_and_dumps_when_missing(self):
        html = '<form><input name="other" value="x"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError) as cm:
            self.bot._find_input_value(
                soup, "token", context="missing_input", raw=html.encode()
            )
        msg = str(cm.exception)
        self.assertIn("missing_input", msg)
        self.assertIn("token", msg)
        dumped = os.listdir(os.path.join("logs", "scraper-failures"))
        self.assertEqual(len(dumped), 1, f"expected one dump, got {dumped}")
        self.assertTrue(dumped[0].startswith("missing_input-"))

    def test_find_input_value_raises_when_input_has_no_value_attr(self):
        html = '<form><input name="token"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError):
            self.bot._find_input_value(
                soup, "token", context="no_value_attr", raw=html.encode()
            )

    def test_find_input_value_does_not_dump_on_success(self):
        html = '<form><input name="token" value="abc"></form>'
        soup = BeautifulSoup(html, "html.parser")
        self.bot._find_input_value(
            soup, "token", context="should_not_dump", raw=html.encode()
        )
        self.assertFalse(
            os.path.isdir(os.path.join("logs", "scraper-failures")),
            "happy path should not create the failure dir",
        )
# Allow running this test module directly as a script, in addition to
# discovery by a test runner.
if __name__ == "__main__":
    unittest.main()