chore: remote waf fingerprinting base

Merge pull request #2595 from obiwan04kanobi/feature/issue-2196-ci-docker-build-test
Add Docker build test to CI workflow (#2196)
2025-10-04 23:54:29 -04:00 · 2025-10-04 21:09:04 -04:00 · 2025-10-04 20:52:40 -04:00 · 2025-10-04 20:36:33 -04:00 · 2025-10-04 20:23:07 -04:00 · 2025-10-05 00:22:12 +05:30
7 changed files with 246 additions and 71 deletions
@@ -11,6 +11,7 @@ on:
      - '**/*.py'
      - '**/*.ini'
      - '**/*.toml'
+      - 'Dockerfile'
  push:
    branches:
      - master
@@ -21,11 +22,13 @@ on:
      - '**/*.py'
      - '**/*.ini'
      - '**/*.toml'
+      - 'Dockerfile'

 jobs:
  tox-lint:
-    # Linting is ran through tox to ensure that the same linter is used by local runners
    runs-on: ubuntu-latest
+    # Linting is ran through tox to ensure that the same linter
+    # is used by local runners
    steps:
      - uses: actions/checkout@v4
      - name: Set up linting environment
@@ -41,7 +44,8 @@ jobs:
  tox-matrix:
    runs-on: ${{ matrix.os }}
    strategy:
-      fail-fast: false # We want to know what specicic versions it fails on
+      # We want to know what specicic versions it fails on
+      fail-fast: false
      matrix:
        os: [
          ubuntu-latest,
@@ -67,3 +71,22 @@ jobs:
          pip install tox-gh-actions
      - name: Run tox
        run: tox
+  docker-build-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Get version from pyproject.toml
+        id: get-version
+        run: |
+          VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2)
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+      - name: Build Docker image
+        run: |
+          docker build \
+            --build-arg VERSION_TAG=${{ steps.get-version.outputs.version }} \
+            -t sherlock-test:latest .
+      - name: Test Docker image runs
+        run: docker run --rm sherlock-test:latest --version
@@ -4,7 +4,7 @@
  # 3. Build image with BOTH latest and version tags
    # i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`

-FROM python:3.12-slim-bullseye as build
+FROM python:3.12-slim-bullseye AS build
 WORKDIR /sherlock

 RUN pip3 install --no-cache-dir --upgrade pip
@@ -46,13 +46,10 @@ PySocks = "^1.7.0"
 requests = "^2.22.0"
 requests-futures = "^1.0.0"
 stem = "^1.8.0"
-torrequest = "^0.1.0"
 pandas = "^2.2.1"
 openpyxl = "^3.0.10"
 tomli = "^2.2.1"
-
-[tool.poetry.extras]
-tor = ["torrequest"]
+pyyaml = "^6.0.3"

 [tool.poetry.group.dev.dependencies]
 jsonschema = "^4.0.0"
@@ -171,8 +171,6 @@ def sherlock(
    username: str,
    site_data: dict[str, dict[str, str]],
    query_notify: QueryNotify,
-    tor: bool = False,
-    unique_tor: bool = False,
    dump_response: bool = False,
    proxy: Optional[str] = None,
    timeout: int = 60,
@@ -188,8 +186,6 @@ def sherlock(
    query_notify           -- Object with base type of QueryNotify().
                              This will be used to notify the caller about
                              query results.
-    tor                    -- Boolean indicating whether to use a tor circuit for the requests.
-    unique_tor             -- Boolean indicating whether to use a new tor circuit for each request.
    proxy                  -- String indicating the proxy URL
    timeout                -- Time in seconds to wait before timing out request.
                              Default is 60 seconds.
@@ -210,32 +206,9 @@ def sherlock(

    # Notify caller that we are starting the query.
    query_notify.start(username)
-    # Create session based on request methodology
-    if tor or unique_tor:
-        try:
-            from torrequest import TorRequest  # noqa: E402
-        except ImportError:
-            print("Important!")
-            print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
-            print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.")
-            print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n")
-            sys.exit(query_notify.finish())

-        print("Important!")
-        print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
-
-        # Requests using Tor obfuscation
-        try:
-            underlying_request = TorRequest()
-        except OSError:
-            print("Tor not found in system path. Unable to continue.\n")
-            sys.exit(query_notify.finish())
-
-        underlying_session = underlying_request.session
-    else:
-        # Normal requests
-        underlying_session = requests.session()
-        underlying_request = requests.Request()
+    # Normal requests
+    underlying_session = requests.session()

    # Limit number of workers to 20.
    # This is probably vastly overkill.
@@ -359,15 +332,10 @@ def sherlock(
            # Store future in data for access later
            net_info["request_future"] = future

-            # Reset identify for tor (if needed)
-            if unique_tor:
-                underlying_request.reset_identity()
-
        # Add this site's results into final dictionary with all the other results.
        results_total[social_network] = results_site

    # Open the file containing account links
-    # Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses
    for social_network, net_info in site_data.items():
        # Retrieve results again
        results_site = results_total.get(social_network)
@@ -600,22 +568,6 @@ def main():
        dest="output",
        help="If using single username, the output of the result will be saved to this file.",
    )
-    parser.add_argument(
-        "--tor",
-        "-t",
-        action="store_true",
-        dest="tor",
-        default=False,
-        help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.",
-    )
-    parser.add_argument(
-        "--unique-tor",
-        "-u",
-        action="store_true",
-        dest="unique_tor",
-        default=False,
-        help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.",
-    )
    parser.add_argument(
        "--csv",
        action="store_true",
@@ -769,22 +721,10 @@ def main():
    except Exception as error:
        print(f"A problem occurred while checking for an update: {error}")

-    # Argument check
-    # TODO regex check on args.proxy
-    if args.tor and (args.proxy is not None):
-        raise Exception("Tor and Proxy cannot be set at the same time.")
-
    # Make prompts
    if args.proxy is not None:
        print("Using the proxy: " + args.proxy)

-    if args.tor or args.unique_tor:
-        print("Using Tor to make requests")
-
-        print(
-            "Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors."
-        )
-
    if args.no_color:
        # Disable color output.
        init(strip=True, convert=False)
@@ -885,8 +825,6 @@ def main():
            username,
            site_data,
            query_notify,
-            tor=args.tor,
-            unique_tor=args.unique_tor,
            dump_response=args.dump_response,
            proxy=args.proxy,
            timeout=args.timeout,
@@ -0,0 +1,84 @@
+import requests
+import yaml
+
+
+NUCLEI_FINGERPRINT_URL: str = "https://raw.githubusercontent.com/projectdiscovery/nuclei-templates/refs/heads/main/http/global-matchers/global-waf-detect.yaml"
+
+def _check_nuclei_regex(matcher: dict[str,str|list[str]], response: requests.Response) -> bool:
+    import re
+
+    and_cond: bool = matcher.get('condition', '') == 'and'
+
+    target_text: str
+    if matcher['part'] == 'body':
+        target_text = response.text
+    elif matcher['part'] == 'header':
+        target_text = str(response.headers)
+    else:
+        target_text = response.text + str(response.headers)
+
+    for regex in matcher['regex']:
+        if re.search(regex, target_text):
+            if not and_cond:
+                return True
+        else:
+            break
+    else:
+        # `and` conditions will cycle, resulting in this default return True
+        # unless an early failed detection breaks the loop (resulting in False)
+        return True
+    return False
+
+def _check_nuclei_words(matcher: dict[str,str|list[str]], response: requests.Response) -> bool:
+    and_cond: bool = matcher.get('condition', '') == 'and'
+
+    target_text: str
+    if matcher['part'] == 'body':
+        target_text = response.text
+    elif matcher['part'] == 'header':
+        target_text = str(response.headers)
+    else:
+        target_text = response.text + str(response.headers)
+
+    for word in matcher['words']:
+        if word in target_text:
+            if not and_cond:
+                return True
+        else:
+            break
+    else:
+        # `and` conditions will cycle, resulting in this default return True
+        # unless an early failed detection breaks the loop (resulting in False)
+        return True
+    return False
+
+def fetch_nuclei_fingerprints() -> list[dict[str,str|list[str]]] | None:
+    """Fetch the latest Nuclei WAF fingerprints from the official repository."""
+    try:
+        response = requests.get(NUCLEI_FINGERPRINT_URL, timeout=10)
+        response.raise_for_status()
+        raw = yaml.safe_load(response.text)
+        fingerprints: list[dict[str,str|list[str]]] = raw['http'][0]['matchers']
+        return fingerprints
+    except requests.RequestException as e:
+        print(f"Error fetching Nuclei fingerprints: {e}")
+        return None
+    except yaml.YAMLError as e:
+        print(f"Error parsing YAML data: {e}")
+        return None
+
+def nuclei_check(response: requests.Response, fingerprints: list[dict[str,str|list[str]]]) -> bool:
+    """Check if the response matches any of the WAF fingerprints.
+
+    Keyword arguments:
+    response -- The HTTP response to check.
+    fingerprints -- The list of Nuclei WAF fingerprints to check against.
+
+    Returns True if a WAF is detected, False otherwise.
+    """
+    for matcher in fingerprints:
+        if matcher['type'] == 'word':
+            return _check_nuclei_words(matcher, response)
+        elif matcher['type'] == 'regex':
+            return _check_nuclei_regex(matcher, response)
+    return False
@@ -0,0 +1,26 @@
+id: global-waf-detect
+http:
+  - global-matchers: true
+    matchers-condition: or
+    matchers:
+      - type: regex
+        name: regexSite
+        regex:
+          - '(?i)access.to.this.page.has.been.denied'
+          - '(?i)http(s)?://(www.)?anotheroneblocked.\w+.whywasiblocked'
+        condition: or
+        part: response
+
+      - type: word
+        name: wordSiteBody
+        part: body
+        words:
+          - "bad_text_in_body"
+
+      - type: word
+        name: wordSiteHead
+        part: header
+        condition: or
+        words:
+          - "text_in_head"
+          - "other_in_head"
@@ -0,0 +1,107 @@
+import os
+import unittest
+from unittest.mock import patch, Mock
+import requests
+from requests.structures import CaseInsensitiveDict
+import yaml
+
+from sherlock_project import waf_check
+
+
+TEMPLATE_BODY_PATH: str = os.path.join(os.path.dirname(__file__), 'mocks', 'global_waf_detect.yaml')
+
+def side_effect(url, **kwargs) -> Mock:
+    if url == waf_check.NUCLEI_FINGERPRINT_URL:
+        with open(TEMPLATE_BODY_PATH, 'r', encoding='utf-8') as file:
+            template_body: str = file.read()
+        mock_response: Mock = Mock()
+        mock_response.status_code = 200
+        mock_response.text = template_body
+        return mock_response
+    raise RuntimeError("Unexpected URL")
+
+class TestWafCheck(unittest.TestCase):
+    """Unit tests for the fingerprint download and the word/regex matchers."""
+
+    # The download is replaced by `side_effect`, which serves the local mock
+    # template; the result must equal that template's http[0].matchers list.
+    @patch('sherlock_project.waf_check.requests.get')
+    def test_fetch_nuclei_fingerprints(self, mock_requests_get): # type: ignore
+        mock_requests_get.side_effect = side_effect
+
+        result = waf_check.fetch_nuclei_fingerprints()
+
+        with open(TEMPLATE_BODY_PATH, 'r', encoding='utf-8') as file:
+            template_body: str = file.read()
+
+        expected: list[dict[str, str | list[str]]] = yaml.safe_load(template_body)['http'][0]['matchers']
+        self.assertEqual(result, expected)
+
+    # The same `matcher` dict is mutated step by step, so every assertion
+    # below depends on the state left behind by the previous ones.
+    def test_nuclei_regex_check(self):
+        mock_res: requests.Response = requests.Response()
+        mock_res.status_code = 200
+        mock_res._content = b"This is a test response with Test-Regex in the body."
+        mock_res.headers = CaseInsensitiveDict({
+            'Content-Type': 'text/html',
+            'Server': 'TestServer'
+        })
+        matcher: dict[str, str | list[str]] = {
+            'type': 'regex',
+            'name': 'test-regex',
+            'part': 'body',
+            'regex': [r'(?i)not-present'],
+            'condition': 'or'
+        }
+        # body / or: the only pattern is absent.
+        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # body / or: the (?i) flag lets the mixed-case pattern match.
+        matcher['regex'] = [r'(?i)TeSt-REgEx']
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # body / or: one hit is enough even though the second pattern is absent.
+        matcher['regex'] = [r'(?i)TeSt-REgEx', r'(?i)Not-Present']
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # body / and: the same pair now fails because both must match.
+        matcher['condition'] = 'and'
+        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # header / and: the Server header value is found case-insensitively.
+        matcher['part'] = 'header'
+        matcher['regex'] = [r'(?i)testserver']
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # response / and: body plus headers still contains the header value.
+        matcher['part'] = 'response'
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # response / and: pattern is absent from both body and headers.
+        matcher['regex'] = [r'(?i)not-present']
+        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+    # Mirrors the regex test with plain substrings; as above, `matcher` is
+    # mutated in place and the assertions rely on that running state.
+    def test_nuclei_words_check(self):
+        mock_res: requests.Response = requests.Response()
+        mock_res.status_code = 200
+        mock_res._content = b"This is a test response with test-words in the body."
+        mock_res.headers = CaseInsensitiveDict({
+            'Content-Type': 'text/html',
+            'Server': 'TestServer'
+        })
+        matcher: dict[str, str | list[str]] = {
+            'type': 'word',
+            'name': 'test-word',
+            'part': 'body',
+            'words': ['not-present'],
+            'condition': 'or'
+        }
+        # body / or: the only word is absent.
+        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # body / or: 'test-word' is a substring of 'test-words' in the body.
+        matcher['words'] = ['test-word']
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # body / or: one hit is enough even though the second word is absent.
+        matcher['words'] = ['test-word', 'Not-Present']
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # body / and: the same pair now fails because both must be present.
+        matcher['condition'] = 'and'
+        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # header / and: word matching is case-sensitive, so lowercase misses.
+        matcher['part'] = 'header'
+        matcher['words'] = ['testserver']
+        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # header / and: the exact-case header value is found.
+        matcher['words'] = ['TestServer']
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # response / and: body plus headers still contains the header value.
+        matcher['part'] = 'response'
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
Author	SHA1	Message	Date
Paul Pfeister	d25848cc5f	chore: remote waf fingerprinting base	2025-10-04 23:54:29 -04:00
Paul Pfeister	f32f4ffaee	Merge pull request #2595 from obiwan04kanobi/feature/issue-2196-ci-docker-build-test Add Docker build test to CI workflow (#2196)	2025-10-04 21:09:04 -04:00
Paul Pfeister	7379ba7b19	Merge branch 'remove-tor'	2025-10-04 20:52:40 -04:00
Paul Pfeister	3aeb6d6356	Merge pull request #2602 from sherlock-project/feat/no-txt chore: make default --no-txt	2025-10-04 20:36:33 -04:00
Paul Pfeister	e44fe49c8f	Merge pull request #2601 from sherlock-project/feat/graceful-skip feat: gracefully skip sites with invalid errorType	2025-10-04 20:23:07 -04:00
obiwan04kanobi	b99719ce60	Add Docker build test to CI workflow - Adds docker-build-test job to regression.yml - Runs on push/merge to master and release branches - Extracts VERSION_TAG from pyproject.toml for build - Tests that Docker image builds and runs successfully - Resolves dockerfile syntax warnings - Resolves #2196"	2025-10-05 00:22:12 +05:30
Paul Pfeister	2016892e64	Remove torrequest dep Not sure why it's not in my patch file, but I was removing via sed in my spec instead.	2024-06-28 23:39:38 -04:00
Paul Pfeister	44ad8f506a	Lint	2024-06-28 23:38:44 -04:00
Siddharth Dushantha	cfa4097df9	removed support for tor	2024-06-26 21:57:11 +02:00