feat(scraper): add ScraperError + _dump_html + _find_input_value helpers
This commit is contained in:
parent
9ec0d2ade4
commit
b7bc534681
@ -3,8 +3,15 @@ import requests, re
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
|
||||
# with open('security_response.html', 'wb') as f:
|
||||
# f.write(response.content)
|
||||
|
||||
class ScraperError(Exception):
    """Raised when a cm99.net response lacks a field the scraper needs.

    The helper that raises this error (see ``_find_input_value``) writes
    the raw response under logs/scraper-failures/ first. The exception
    message names the failing context and the element that was being
    looked for.
    """
|
||||
|
||||
|
||||
class CM_BOT:
|
||||
def __init__(self):
|
||||
@ -202,6 +209,41 @@ class CM_BOT:
|
||||
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36'
|
||||
}
|
||||
|
||||
def _dump_html(self, context: str, content) -> str:
|
||||
"""Save a failing cm99.net response to logs/scraper-failures/.
|
||||
|
||||
Returns the path written to so callers can include it in error
|
||||
messages.
|
||||
"""
|
||||
ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
out_dir = os.path.join("logs", "scraper-failures")
|
||||
os.makedirs(out_dir, exist_ok=True)
|
||||
path = os.path.join(out_dir, f"{context}-{ts}.html")
|
||||
if isinstance(content, (bytes, bytearray)):
|
||||
data = bytes(content)
|
||||
else:
|
||||
data = str(content).encode("utf-8", "replace")
|
||||
with open(path, "wb") as f:
|
||||
f.write(data)
|
||||
print(f"[scraper-failure] dumped {context} response to {path}")
|
||||
return path
|
||||
|
||||
def _find_input_value(self, soup, ident: str, *, context: str, raw, by: str = "name") -> str:
|
||||
"""Extract <input {by}=IDENT value=...>'s value or raise ScraperError.
|
||||
|
||||
`by` selects between matching <input name=...> (default) and
|
||||
<input id=...>. Saves the raw response to logs/scraper-failures/
|
||||
before raising so the operator can postmortem.
|
||||
"""
|
||||
el = soup.find("input", {by: ident})
|
||||
if el is None or "value" not in el.attrs:
|
||||
path = self._dump_html(context, raw)
|
||||
raise ScraperError(
|
||||
f"{context}: input[{by}={ident!r}] missing or has no value attribute "
|
||||
f"(response saved to {path})"
|
||||
)
|
||||
return el["value"]
|
||||
|
||||
def get_register_data(self, token: str, username: str, password: str):
|
||||
return {
|
||||
'struts.token.name': 'token',
|
||||
|
||||
119
tests/test_cm_bot_scraper.py
Normal file
119
tests/test_cm_bot_scraper.py
Normal file
@ -0,0 +1,119 @@
|
||||
"""Tests for the cm_bot scraper resilience helpers.
|
||||
|
||||
The CM_BOT class currently uses bare `soup.find(...)['value']` calls
|
||||
that throw cryptic TypeErrors when cm99.net returns an unexpected
|
||||
response. R3 introduces three pieces:
|
||||
- ScraperError: typed exception so callers can distinguish scraper
|
||||
failures from network errors.
|
||||
- _dump_html(context, content): writes the failing response to
|
||||
logs/scraper-failures/<context>-<ts>.html and returns the path.
|
||||
- _find_input_value(soup, ident, *, context, raw, by='name'):
|
||||
the dominant extraction pattern. Returns the value on success,
|
||||
dumps + raises ScraperError on miss.
|
||||
|
||||
These tests do NOT exercise the live cm99.net integration. They use
|
||||
small inline HTML fixtures and patch filesystem side effects so the
|
||||
tests stay hermetic.
|
||||
"""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest import mock
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from app.cm_bot import CM_BOT, ScraperError
|
||||
|
||||
|
||||
class ScraperHelpersTests(unittest.TestCase):
    """Hermetic tests for CM_BOT._dump_html and CM_BOT._find_input_value.

    Each test runs inside a private temporary working directory because
    _dump_html writes to the relative path logs/scraper-failures.
    """

    def setUp(self):
        # CM_BOT.__init__ reads CM_BOT_BASE_URL from the env. Patch it
        # in setUp (mock.patch.dict as a class decorator only wraps
        # test_* methods, so setUp would see an unpatched env).
        self._env_patcher = mock.patch.dict(
            os.environ, {"CM_BOT_BASE_URL": "https://example.invalid"}
        )
        self._env_patcher.start()
        self.addCleanup(self._env_patcher.stop)

        # Each test gets a fresh tmpdir so the dump helper writes
        # somewhere predictable. Cleanups are registered immediately after
        # each side effect: unittest skips tearDown when setUp raises (for
        # example if CM_BOT() fails), but it always runs addCleanup
        # callbacks. They run in LIFO order, so the cwd is restored before
        # the directory is removed.
        self._old_cwd = os.getcwd()
        self._tmp = tempfile.mkdtemp(prefix="r3-test-")
        self.addCleanup(shutil.rmtree, self._tmp, ignore_errors=True)
        os.chdir(self._tmp)
        self.addCleanup(os.chdir, self._old_cwd)

        self.bot = CM_BOT()

    # ---- _dump_html ----

    def test_dump_html_creates_dir_and_writes_bytes(self):
        path = self.bot._dump_html("ctx-test", b"<html>hi</html>")
        self.assertTrue(os.path.isfile(path), f"file should exist: {path}")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")
        self.assertTrue(path.startswith(os.path.join("logs", "scraper-failures")))

    def test_dump_html_accepts_str_content(self):
        path = self.bot._dump_html("ctx-test", "<html>hi</html>")
        with open(path, "rb") as f:
            self.assertEqual(f.read(), b"<html>hi</html>")

    def test_dump_html_includes_context_and_timestamp_in_filename(self):
        path = self.bot._dump_html("register_form_token", b"x")
        basename = os.path.basename(path)
        self.assertTrue(basename.startswith("register_form_token-"), basename)
        self.assertTrue(basename.endswith(".html"), basename)

    # ---- _find_input_value ----

    def test_find_input_value_returns_value_when_present(self):
        html = '<form><input name="token" value="abc123"></form>'
        soup = BeautifulSoup(html, "html.parser")
        result = self.bot._find_input_value(
            soup, "token", context="happy_path", raw=html.encode()
        )
        self.assertEqual(result, "abc123")

    def test_find_input_value_matches_by_id(self):
        html = '<form><input id="csrf" name="other" value="by-id"></form>'
        soup = BeautifulSoup(html, "html.parser")
        result = self.bot._find_input_value(
            soup, "csrf", context="by_id", raw=html.encode(), by="id"
        )
        self.assertEqual(result, "by-id")

    def test_find_input_value_raises_and_dumps_when_missing(self):
        html = '<form><input name="other" value="x"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError) as cm:
            self.bot._find_input_value(
                soup, "token", context="missing_input", raw=html.encode()
            )
        msg = str(cm.exception)
        self.assertIn("missing_input", msg)
        self.assertIn("token", msg)
        dumped = os.listdir(os.path.join("logs", "scraper-failures"))
        self.assertEqual(len(dumped), 1, f"expected one dump, got {dumped}")
        self.assertTrue(dumped[0].startswith("missing_input-"))

    def test_find_input_value_raises_when_input_has_no_value_attr(self):
        html = '<form><input name="token"></form>'
        soup = BeautifulSoup(html, "html.parser")
        with self.assertRaises(ScraperError):
            self.bot._find_input_value(
                soup, "token", context="no_value_attr", raw=html.encode()
            )

    def test_find_input_value_does_not_dump_on_success(self):
        html = '<form><input name="token" value="abc"></form>'
        soup = BeautifulSoup(html, "html.parser")
        self.bot._find_input_value(
            soup, "token", context="should_not_dump", raw=html.encode()
        )
        self.assertFalse(
            os.path.isdir(os.path.join("logs", "scraper-failures")),
            "happy path should not create the failure dir",
        )
|
||||
|
||||
|
||||
# Allow running this test module directly as a script; unittest.main()
# discovers and runs the TestCase classes defined above.
if __name__ == "__main__":
    unittest.main()
|
||||
Loading…
x
Reference in New Issue
Block a user