chore: remote waf fingerprinting base

Merge pull request #2595 from obiwan04kanobi/feature/issue-2196-ci-docker-build-test
Add Docker build test to CI workflow (#2196)
2025-10-04 23:54:29 -04:00 · 2025-10-04 21:09:04 -04:00 · 2025-10-04 20:52:40 -04:00 · 2025-10-04 20:36:33 -04:00 · 2025-10-04 20:32:16 -04:00 · 2025-10-04 20:23:07 -04:00
11 changed files with 513 additions and 230 deletions
@@ -1,5 +1,5 @@
 ### REPOSITORY
-/.github/CODEOWNERS @sdushantha
+/.github/CODEOWNERS @sdushantha @ppfeister
 /.github/FUNDING.yml @sdushantha
 /LICENSE @sdushantha

@@ -11,6 +11,7 @@ on:
      - '**/*.py'
      - '**/*.ini'
      - '**/*.toml'
+      - 'Dockerfile'
  push:
    branches:
      - master
@@ -21,11 +22,13 @@ on:
      - '**/*.py'
      - '**/*.ini'
      - '**/*.toml'
+      - 'Dockerfile'

 jobs:
  tox-lint:
-    # Linting is ran through tox to ensure that the same linter is used by local runners
    runs-on: ubuntu-latest
+    # Linting is run through tox to ensure that the same linter
+    # is used by local runners
    steps:
      - uses: actions/checkout@v4
      - name: Set up linting environment
@@ -41,7 +44,8 @@ jobs:
  tox-matrix:
    runs-on: ${{ matrix.os }}
    strategy:
-      fail-fast: false # We want to know what specicic versions it fails on
+      # We want to know what specific versions it fails on
+      fail-fast: false
      matrix:
        os: [
          ubuntu-latest,
@@ -67,3 +71,22 @@ jobs:
          pip install tox-gh-actions
      - name: Run tox
        run: tox
+  docker-build-test:
+    # Builds the image from the repository's Dockerfile and runs it once with
+    # `--version`, so a Dockerfile that no longer builds or starts fails CI.
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Get version from pyproject.toml
+        id: get-version
+        # Takes the first line containing `version = ` and extracts the text
+        # between the first pair of double quotes.
+        run: |
+          VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2)
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+      - name: Build Docker image
+        # The step output is handed over through an environment variable
+        # rather than expanded inside the script, so its content is never
+        # interpreted as shell syntax.
+        env:
+          VERSION_TAG: ${{ steps.get-version.outputs.version }}
+        run: |
+          docker build \
+            --build-arg VERSION_TAG="$VERSION_TAG" \
+            -t sherlock-test:latest .
+      - name: Test Docker image runs
+        run: docker run --rm sherlock-test:latest --version
@@ -4,7 +4,7 @@
  # 3. Build image with BOTH latest and version tags
    # i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`

-FROM python:3.12-slim-bullseye as build
+FROM python:3.12-slim-bullseye AS build
 WORKDIR /sherlock

 RUN pip3 install --no-cache-dir --upgrade pip
@@ -46,13 +46,10 @@ PySocks = "^1.7.0"
 requests = "^2.22.0"
 requests-futures = "^1.0.0"
 stem = "^1.8.0"
-torrequest = "^0.1.0"
 pandas = "^2.2.1"
 openpyxl = "^3.0.10"
 tomli = "^2.2.1"
-
-[tool.poetry.extras]
-tor = ["torrequest"]
+pyyaml = "^6.0.3"

 [tool.poetry.group.dev.dependencies]
 jsonschema = "^4.0.0"
@@ -79,13 +79,13 @@
    "username_claimed": "pink"
  },
  "AllMyLinks": {
-    "errorMsg": "Not Found",
-    "errorType": "message",
-    "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$",
-    "url": "https://allmylinks.com/{}",
-    "urlMain": "https://allmylinks.com/",
-    "username_claimed": "blue"
-  },
+    "errorMsg": "Page not found",
+    "errorType": "message",
+    "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$",
+    "url": "https://allmylinks.com/{}",
+    "urlMain": "https://allmylinks.com/",
+    "username_claimed": "blue"
+  },
  "AniWorld": {
    "errorMsg": "Dieses Profil ist nicht verf\u00fcgbar",
    "errorType": "message",
@@ -115,7 +115,7 @@
    "username_claimed": "lio24d"
  },
  "Apple Discussions": {
-    "errorMsg": "The page you tried was not found. You may have used an outdated link or may have typed the address (URL) incorrectly.",
+    "errorMsg": "Looking for something in Apple Support Communities?",
    "errorType": "message",
    "url": "https://discussions.apple.com/profile/{}",
    "urlMain": "https://discussions.apple.com",
@@ -572,8 +572,7 @@
    "username_claimed": "brown"
  },
  "CyberDefenders": {
-    "errorMsg": "<title>Blue Team Training for SOC analysts and DFIR - CyberDefenders</title>",
-    "errorType": "message",
+    "errorType": "status_code",
    "regexCheck": "^[^\\/:*?\"<>|@]{3,50}$",
    "request_method": "GET",
    "url": "https://cyberdefenders.org/p/{}",
@@ -600,6 +599,12 @@
    "urlMain": "https://www.dailymotion.com/",
    "username_claimed": "blue"
  },
+  "dcinside": {
+    "errorType": "status_code",
+    "url": "https://gallog.dcinside.com/{}",
+    "urlMain": "https://www.dcinside.com/",
+    "username_claimed": "anrbrb"
+  },
  "Dealabs": {
    "errorMsg": "La page que vous essayez",
    "errorType": "message",
@@ -608,13 +613,14 @@
    "urlMain": "https://www.dealabs.com/",
    "username_claimed": "blue"
  },
-  "DeviantART": {
-    "errorType": "status_code",
-    "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
-    "url": "https://{}.deviantart.com",
-    "urlMain": "https://deviantart.com",
-    "username_claimed": "blue"
-  },
+  "DeviantArt": {
+    "errorType": "message",
+    "errorMsg": "Llama Not Found",
+    "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
+    "url": "https://www.deviantart.com/{}",
+    "urlMain": "https://www.deviantart.com/",
+    "username_claimed": "blue"
+  },
  "DigitalSpy": {
      "errorMsg": "The page you were looking for could not be found.",
      "errorType": "message",
@@ -1440,12 +1446,12 @@
    "username_claimed": "blue"
  },
  "Mydramalist": {
-    "errorMsg": "Sign in - MyDramaList",
-    "errorType": "message",
-    "url": "https://www.mydramalist.com/profile/{}",
-    "urlMain": "https://mydramalist.com",
-    "username_claimed": "elhadidy12398"
-  },
+    "errorMsg": "The requested page was not found",
+    "errorType": "message",
+    "url": "https://www.mydramalist.com/profile/{}",
+    "urlMain": "https://mydramalist.com",
+    "username_claimed": "elhadidy12398"
+  },
  "Myspace": {
    "errorType": "status_code",
    "url": "https://myspace.com/{}",
@@ -1459,6 +1465,13 @@
    "urlMain": "https://www.native-instruments.com/forum/",
    "username_claimed": "jambert"
  },
+  "namuwiki": {
+    "__comment__": "This is a Korean site and it's expected to return false negatives in certain other regions.",
+    "errorType": "status_code",
+    "url": "https://namu.wiki/w/%EC%82%AC%EC%9A%A9%EC%9E%90:{}",
+    "urlMain": "https://namu.wiki/",
+    "username_claimed": "namu"
+  },
  "NationStates Nation": {
    "errorMsg": "Was this your nation? It may have ceased to exist due to inactivity, but can rise again!",
    "errorType": "message",
@@ -1809,8 +1822,7 @@
    "username_claimed": "blue"
  },
  "Roblox": {
-    "errorMsg": "Page cannot be found or no longer exists",
-    "errorType": "message",
+    "errorType": "status_code",
    "url": "https://www.roblox.com/user.aspx?username={}",
    "urlMain": "https://www.roblox.com/",
    "username_claimed": "bluewolfekiller"
@@ -1918,7 +1930,7 @@
  },
  "SlideShare": {
    "errorType": "message",
-    "errorMsg": "<title>Username available</title>",
+    "errorMsg": "<title>Page no longer exists</title>",
    "url": "https://slideshare.net/{}",
    "urlMain": "https://slideshare.net/",
    "username_claimed": "blue"
@@ -1952,6 +1964,13 @@
    "urlMain": "https://www.snapchat.com",
    "username_claimed": "teamsnapchat"
  },
+  "SOOP": {
+    "errorType": "status_code",
+    "url": "https://www.sooplive.co.kr/station/{}",
+    "urlMain": "https://www.sooplive.co.kr/",
+    "urlProbe": "https://api-channel.sooplive.co.kr/v1.1/channel/{}/station",
+    "username_claimed": "udkn"
+  },
  "SoundCloud": {
    "errorType": "status_code",
    "url": "https://soundcloud.com/{}",
@@ -2119,6 +2138,12 @@
    "urlMain": "https://themeforest.net/",
    "username_claimed": "user"
  },
+  "tistory": {
+    "errorType": "status_code",
+    "url": "https://{}.tistory.com/",
+    "urlMain": "https://www.tistory.com/",
+    "username_claimed": "notice"
+  },
  "TnAFlix": {
    "errorType": "status_code",
    "isNSFW": true,
@@ -2793,7 +2818,7 @@
    "username_claimed": "green"
  },
  "threads": {
-    "errorMsg": "<title>Threads</title>",
+    "errorMsg": "<title>Threads • Log in</title>",
    "errorType": "message",
    "headers": {
      "Sec-Fetch-Mode": "navigate"
@@ -1,80 +1,149 @@
 {
-    "$schema": "https://json-schema.org/draft/2020-12/schema",
-    "title": "Sherlock Target Manifest",
-    "description": "Social media targets to probe for the existence of known usernames",
-    "type": "object",
-    "properties": {
-        "$schema": { "type": "string" }
-    },
-    "patternProperties": {
-        "^(?!\\$).*?$": {
-            "type": "object",
-            "description": "Target name and associated information (key should be human readable name)",
-            "required": [ "url", "urlMain", "errorType", "username_claimed" ],
-            "properties": {
-                "url": { "type": "string" },
-                "urlMain": { "type": "string" },
-                "urlProbe": { "type": "string" },
-                "username_claimed": { "type": "string" },
-                "regexCheck": { "type": "string" },
-                "isNSFW": { "type": "boolean" },
-                "headers": { "type": "object" },
-                "request_payload": { "type": "object" },
-                "__comment__": {
-                    "type": "string",
-                    "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
-                },
-                "tags": {
-                    "oneOf": [
-                        { "$ref": "#/$defs/tag" },
-                        { "type": "array", "items": { "$ref": "#/$defs/tag" } }
-                    ]
-                },
-                "request_method": {
-                    "type": "string",
-                    "enum": [ "GET", "POST", "HEAD", "PUT" ]
-                },
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "title": "Sherlock Target Manifest",
+  "description": "Social media targets to probe for the existence of known usernames",
+  "type": "object",
+  "properties": {
+    "$schema": { "type": "string" }
+  },
+  "patternProperties": {
+    "^(?!\\$).*?$": {
+      "type": "object",
+      "description": "Target name and associated information (key should be human readable name)",
+      "required": ["url", "urlMain", "errorType", "username_claimed"],
+      "properties": {
+        "url": { "type": "string" },
+        "urlMain": { "type": "string" },
+        "urlProbe": { "type": "string" },
+        "username_claimed": { "type": "string" },
+        "regexCheck": { "type": "string" },
+        "isNSFW": { "type": "boolean" },
+        "headers": { "type": "object" },
+        "request_payload": { "type": "object" },
+        "__comment__": {
+          "type": "string",
+          "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
+        },
+        "tags": {
+          "oneOf": [
+            { "$ref": "#/$defs/tag" },
+            { "type": "array", "items": { "$ref": "#/$defs/tag" } }
+          ]
+        },
+        "request_method": {
+          "type": "string",
+          "enum": ["GET", "POST", "HEAD", "PUT"]
+        },
+        "errorType": {
+          "oneOf": [
+            {
+              "type": "string",
+              "enum": ["message", "response_url", "status_code"]
+            },
+            {
+              "type": "array",
+              "items": {
+                "type": "string",
+                "enum": ["message", "response_url", "status_code"]
+              }
+            }
+          ]
+        },
+        "errorMsg": {
+          "oneOf": [
+            { "type": "string" },
+            { "type": "array", "items": { "type": "string" } }
+          ]
+        },
+        "errorCode": {
+          "oneOf": [
+            { "type": "integer" },
+            { "type": "array", "items": { "type": "integer" } }
+          ]
+        },
+        "errorUrl": { "type": "string" },
+        "response_url": { "type": "string" }
+      },
+      "dependencies": {
+        "errorMsg": {
+          "oneOf": [
+            { "properties": { "errorType": { "const": "message" } } },
+            {
+              "properties": {
                "errorType": {
-                    "type": "string",
-                    "enum": [ "message", "response_url", "status_code" ]
-                },
-                "errorMsg": {
-                    "oneOf": [
-                        { "type": "string" },
-                        { "type": "array", "items": { "type": "string" } }
-                    ]
-                },
-                "errorCode": {
-                    "oneOf": [
-                        { "type": "integer" },
-                        { "type": "array", "items": { "type": "integer" } }
-                    ]
-                },
-                "errorUrl": { "type": "string" },
-                "response_url": { "type": "string" }
-            },
-            "dependencies": {
-                "errorMsg": {
-                    "properties" : { "errorType": { "const": "message" } }
-                },
-                "errorUrl": {
-                    "properties": { "errorType": { "const": "response_url" } }
-                },
-                "errorCode": {
-                    "properties": { "errorType": { "const": "status_code" } }
+                  "type": "array",
+                  "contains": { "const": "message" }
                }
-            },
-            "if": { "properties": { "errorType": { "const": "message" } } },
-            "then": { "required": [ "errorMsg" ] },
-            "else": {
-                "if": { "properties": { "errorType": { "const": "response_url" } } },
-                "then": { "required": [ "errorUrl" ] }
-            },
-            "additionalProperties": false
+              }
+            }
+          ]
+        },
+        "errorUrl": {
+          "oneOf": [
+            { "properties": { "errorType": { "const": "response_url" } } },
+            {
+              "properties": {
+                "errorType": {
+                  "type": "array",
+                  "contains": { "const": "response_url" }
+                }
+              }
+            }
+          ]
+        },
+        "errorCode": {
+          "oneOf": [
+            { "properties": { "errorType": { "const": "status_code" } } },
+            {
+              "properties": {
+                "errorType": {
+                  "type": "array",
+                  "contains": { "const": "status_code" }
+                }
+              }
+            }
+          ]
        }
-    },
-    "additionalProperties": false,
-    "$defs": {
-        "tag": { "type": "string", "enum": [ "adult", "gaming" ] }
+      },
+      "allOf": [
+        {
+          "if": {
+            "anyOf": [
+              { "properties": { "errorType": { "const": "message" } } },
+              {
+                "properties": {
+                  "errorType": {
+                    "type": "array",
+                    "contains": { "const": "message" }
+                  }
+                }
+              }
+            ]
+          },
+          "then": { "required": ["errorMsg"] }
+        },
+        {
+          "if": {
+            "anyOf": [
+              { "properties": { "errorType": { "const": "response_url" } } },
+              {
+                "properties": {
+                  "errorType": {
+                    "type": "array",
+                    "contains": { "const": "response_url" }
+                  }
+                }
+              }
+            ]
+          },
+          "then": { "required": ["errorUrl"] }
+        }
+      ],
+      "additionalProperties": false
    }
+  },
+  "additionalProperties": false,
+  "$defs": {
+    "tag": { "type": "string", "enum": ["adult", "gaming"] }
+  }
 }
@@ -171,8 +171,6 @@ def sherlock(
    username: str,
    site_data: dict[str, dict[str, str]],
    query_notify: QueryNotify,
-    tor: bool = False,
-    unique_tor: bool = False,
    dump_response: bool = False,
    proxy: Optional[str] = None,
    timeout: int = 60,
@@ -188,8 +186,6 @@ def sherlock(
    query_notify           -- Object with base type of QueryNotify().
                              This will be used to notify the caller about
                              query results.
-    tor                    -- Boolean indicating whether to use a tor circuit for the requests.
-    unique_tor             -- Boolean indicating whether to use a new tor circuit for each request.
    proxy                  -- String indicating the proxy URL
    timeout                -- Time in seconds to wait before timing out request.
                              Default is 60 seconds.
@@ -210,32 +206,9 @@ def sherlock(

    # Notify caller that we are starting the query.
    query_notify.start(username)
-    # Create session based on request methodology
-    if tor or unique_tor:
-        try:
-            from torrequest import TorRequest  # noqa: E402
-        except ImportError:
-            print("Important!")
-            print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
-            print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.")
-            print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n")
-            sys.exit(query_notify.finish())

-        print("Important!")
-        print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
-
-        # Requests using Tor obfuscation
-        try:
-            underlying_request = TorRequest()
-        except OSError:
-            print("Tor not found in system path. Unable to continue.\n")
-            sys.exit(query_notify.finish())
-
-        underlying_session = underlying_request.session
-    else:
-        # Normal requests
-        underlying_session = requests.session()
-        underlying_request = requests.Request()
+    # Normal requests
+    underlying_session = requests.session()

    # Limit number of workers to 20.
    # This is probably vastly overkill.
@@ -359,15 +332,10 @@ def sherlock(
            # Store future in data for access later
            net_info["request_future"] = future

-            # Reset identify for tor (if needed)
-            if unique_tor:
-                underlying_request.reset_identity()
-
        # Add this site's results into final dictionary with all the other results.
        results_total[social_network] = results_site

    # Open the file containing account links
-    # Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses
    for social_network, net_info in site_data.items():
        # Retrieve results again
        results_site = results_total.get(social_network)
@@ -381,6 +349,8 @@ def sherlock(

        # Get the expected error type
        error_type = net_info["errorType"]
+        if isinstance(error_type, str):
+            error_type: list[str] = [error_type]

        # Retrieve future and ensure it has finished
        future = net_info["request_future"]
@@ -425,58 +395,60 @@ def sherlock(
        elif any(hitMsg in r.text for hitMsg in WAFHitMsgs):
            query_status = QueryStatus.WAF

-        elif error_type == "message":
-            # error_flag True denotes no error found in the HTML
-            # error_flag False denotes error found in the HTML
-            error_flag = True
-            errors = net_info.get("errorMsg")
-            # errors will hold the error message
-            # it can be string or list
-            # by isinstance method we can detect that
-            # and handle the case for strings as normal procedure
-            # and if its list we can iterate the errors
-            if isinstance(errors, str):
-                # Checks if the error message is in the HTML
-                # if error is present we will set flag to False
-                if errors in r.text:
-                    error_flag = False
-            else:
-                # If it's list, it will iterate all the error message
-                for error in errors:
-                    if error in r.text:
-                        error_flag = False
-                        break
-            if error_flag:
-                query_status = QueryStatus.CLAIMED
-            else:
-                query_status = QueryStatus.AVAILABLE
-        elif error_type == "status_code":
-            error_codes = net_info.get("errorCode")
-            query_status = QueryStatus.CLAIMED
-
-            # Type consistency, allowing for both singlets and lists in manifest
-            if isinstance(error_codes, int):
-                error_codes = [error_codes]
-
-            if error_codes is not None and r.status_code in error_codes:
-                query_status = QueryStatus.AVAILABLE
-            elif r.status_code >= 300 or r.status_code < 200:
-                query_status = QueryStatus.AVAILABLE
-        elif error_type == "response_url":
-            # For this detection method, we have turned off the redirect.
-            # So, there is no need to check the response URL: it will always
-            # match the request.  Instead, we will ensure that the response
-            # code indicates that the request was successful (i.e. no 404, or
-            # forward to some odd redirect).
-            if 200 <= r.status_code < 300:
-                query_status = QueryStatus.CLAIMED
-            else:
-                query_status = QueryStatus.AVAILABLE
        else:
-            # It should be impossible to ever get here...
-            raise ValueError(
-                f"Unknown Error Type '{error_type}' for " f"site '{social_network}'"
-            )
+            if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type):
+                error_context = f"Unknown error type '{error_type}' for {social_network}"
+                query_status = QueryStatus.UNKNOWN
+            else:
+                if "message" in error_type:
+                    # error_flag True denotes no error found in the HTML
+                    # error_flag False denotes error found in the HTML
+                    error_flag = True
+                    errors = net_info.get("errorMsg")
+                    # errors will hold the error message
+                    # it can be string or list
+                    # by isinstance method we can detect that
+                    # and handle the case for strings as normal procedure
+                    # and if its list we can iterate the errors
+                    if isinstance(errors, str):
+                        # Checks if the error message is in the HTML
+                        # if error is present we will set flag to False
+                        if errors in r.text:
+                            error_flag = False
+                    else:
+                        # If it's list, it will iterate all the error message
+                        for error in errors:
+                            if error in r.text:
+                                error_flag = False
+                                break
+                    if error_flag:
+                        query_status = QueryStatus.CLAIMED
+                    else:
+                        query_status = QueryStatus.AVAILABLE
+
+                if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE:
+                    error_codes = net_info.get("errorCode")
+                    query_status = QueryStatus.CLAIMED
+
+                    # Type consistency, allowing for both singlets and lists in manifest
+                    if isinstance(error_codes, int):
+                        error_codes = [error_codes]
+
+                    if error_codes is not None and r.status_code in error_codes:
+                        query_status = QueryStatus.AVAILABLE
+                    elif r.status_code >= 300 or r.status_code < 200:
+                        query_status = QueryStatus.AVAILABLE
+
+                if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE:
+                    # For this detection method, we have turned off the redirect.
+                    # So, there is no need to check the response URL: it will always
+                    # match the request.  Instead, we will ensure that the response
+                    # code indicates that the request was successful (i.e. no 404, or
+                    # forward to some odd redirect).
+                    if 200 <= r.status_code < 300:
+                        query_status = QueryStatus.CLAIMED
+                    else:
+                        query_status = QueryStatus.AVAILABLE

        if dump_response:
            print("+++++++++++++++++++++")
@@ -596,22 +568,6 @@ def main():
        dest="output",
        help="If using single username, the output of the result will be saved to this file.",
    )
-    parser.add_argument(
-        "--tor",
-        "-t",
-        action="store_true",
-        dest="tor",
-        default=False,
-        help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.",
-    )
-    parser.add_argument(
-        "--unique-tor",
-        "-u",
-        action="store_true",
-        dest="unique_tor",
-        default=False,
-        help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.",
-    )
    parser.add_argument(
        "--csv",
        action="store_true",
@@ -719,12 +675,22 @@ def main():
        help="Include checking of NSFW sites from default list.",
    )

+    # TODO deprecated in favor of --txt, retained for workflow compatibility, to be removed
+    # in future release
    parser.add_argument(
        "--no-txt",
        action="store_true",
        dest="no_txt",
        default=False,
-        help="Disable creation of a txt file",
+        help="Disable creation of a txt file - WILL BE DEPRECATED",
+    )
+
+    parser.add_argument(
+        "--txt",
+        action="store_true",
+        dest="output_txt",
+        default=False,
+        help="Enable creation of a txt file",
    )

    parser.add_argument(
@@ -742,7 +708,7 @@ def main():

    # Check for newer version of Sherlock. If it exists, let the user know about it
    try:
-        latest_release_raw = requests.get(forge_api_latest_release).text
+        latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text
        latest_release_json = json_loads(latest_release_raw)
        latest_remote_tag = latest_release_json["tag_name"]

@@ -755,22 +721,10 @@ def main():
    except Exception as error:
        print(f"A problem occurred while checking for an update: {error}")

-    # Argument check
-    # TODO regex check on args.proxy
-    if args.tor and (args.proxy is not None):
-        raise Exception("Tor and Proxy cannot be set at the same time.")
-
    # Make prompts
    if args.proxy is not None:
        print("Using the proxy: " + args.proxy)

-    if args.tor or args.unique_tor:
-        print("Using Tor to make requests")
-
-        print(
-            "Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors."
-        )
-
    if args.no_color:
        # Disable color output.
        init(strip=True, convert=False)
@@ -802,7 +756,7 @@ def main():
                if args.json_file.isnumeric():
                    pull_number = args.json_file
                    pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}"
-                    pull_request_raw = requests.get(pull_url).text
+                    pull_request_raw = requests.get(pull_url, timeout=10).text
                    pull_request_json = json_loads(pull_request_raw)

                    # Check if it's a valid pull request
@@ -871,8 +825,6 @@ def main():
            username,
            site_data,
            query_notify,
-            tor=args.tor,
-            unique_tor=args.unique_tor,
            dump_response=args.dump_response,
            proxy=args.proxy,
            timeout=args.timeout,
@@ -888,7 +840,7 @@ def main():
        else:
            result_file = f"{username}.txt"

-        if not args.no_txt:
+        if args.output_txt:
            with open(result_file, "w", encoding="utf-8") as file:
                exists_counter = 0
                for website_name in results:
@@ -129,7 +129,7 @@ class SitesInformation:
        if data_file_path.lower().startswith("http"):
            # Reference is to a URL.
            try:
-                response = requests.get(url=data_file_path)
+                response = requests.get(url=data_file_path, timeout=30)
            except Exception as error:
                raise FileNotFoundError(
                    f"Problem while attempting to access data file URL '{data_file_path}':  {error}"
@@ -166,7 +166,7 @@ class SitesInformation:

        if honor_exclusions:
            try:
-                response = requests.get(url=EXCLUSIONS_URL)
+                response = requests.get(url=EXCLUSIONS_URL, timeout=10)
                if response.status_code == 200:
                    exclusions = response.text.splitlines()
                    exclusions = [exclusion.strip() for exclusion in exclusions]
@@ -0,0 +1,84 @@
+import requests
+import yaml
+
+
+NUCLEI_FINGERPRINT_URL: str = "https://raw.githubusercontent.com/projectdiscovery/nuclei-templates/refs/heads/main/http/global-matchers/global-waf-detect.yaml"
+
+def _check_nuclei_regex(matcher: dict[str,str|list[str]], response: requests.Response) -> bool:
+    """Evaluate a single Nuclei ``regex`` matcher against an HTTP response.
+
+    Keyword arguments:
+    matcher -- One matcher mapping. Reads ``regex`` (list of patterns),
+               ``part`` (``body``, ``header``; any other value searches the
+               body text followed by the stringified headers) and the
+               optional ``condition`` (``and``; anything else, including
+               an absent key, is treated as ``or``).
+    response -- The HTTP response to inspect.
+
+    Returns True when every pattern is found (``and``) or when at least one
+    pattern is found (``or``). An empty pattern list never matches.
+    """
+    import re
+
+    and_cond: bool = matcher.get('condition', '') == 'and'
+
+    # Nuclei matches against the body when a matcher does not name a part.
+    part = matcher.get('part', 'body')
+
+    target_text: str
+    if part == 'body':
+        target_text = response.text
+    elif part == 'header':
+        target_text = str(response.headers)
+    else:
+        target_text = response.text + str(response.headers)
+
+    patterns = matcher['regex']
+    if not patterns:
+        # Nothing to look for; reporting a hit here would flag every response.
+        return False
+
+    # Lazy generator: `all` stops at the first miss, `any` at the first hit.
+    hits = (re.search(regex, target_text) is not None for regex in patterns)
+    return all(hits) if and_cond else any(hits)
+
+def _check_nuclei_words(matcher: dict[str,str|list[str]], response: requests.Response) -> bool:
+    """Evaluate a single Nuclei ``word`` matcher against an HTTP response.
+
+    Keyword arguments:
+    matcher -- One matcher mapping. Reads ``words`` (list of substrings),
+               ``part`` (``body``, ``header``; any other value searches the
+               body text followed by the stringified headers) and the
+               optional ``condition`` (``and``; anything else, including
+               an absent key, is treated as ``or``).
+    response -- The HTTP response to inspect.
+
+    Returns True when every word is present (``and``) or when at least one
+    word is present (``or``). An empty word list never matches.
+    """
+    and_cond: bool = matcher.get('condition', '') == 'and'
+
+    # Nuclei matches against the body when a matcher does not name a part.
+    part = matcher.get('part', 'body')
+
+    target_text: str
+    if part == 'body':
+        target_text = response.text
+    elif part == 'header':
+        target_text = str(response.headers)
+    else:
+        target_text = response.text + str(response.headers)
+
+    words = matcher['words']
+    if not words:
+        # Nothing to look for; reporting a hit here would flag every response.
+        return False
+
+    # Lazy generator: `all` stops at the first miss, `any` at the first hit.
+    hits = (word in target_text for word in words)
+    return all(hits) if and_cond else any(hits)
+
+def fetch_nuclei_fingerprints() -> list[dict[str,str|list[str]]] | None:
+    """Fetch the latest Nuclei WAF fingerprints from the official repository.
+
+    Downloads the template at NUCLEI_FINGERPRINT_URL, parses it as YAML and
+    returns the matcher list found at ``http[0].matchers``.
+
+    Returns None (after printing a short message) when the download fails,
+    the body is not valid YAML, or the parsed document does not have the
+    expected layout.
+    """
+    try:
+        response = requests.get(NUCLEI_FINGERPRINT_URL, timeout=10)
+        response.raise_for_status()
+        raw = yaml.safe_load(response.text)
+        fingerprints: list[dict[str,str|list[str]]] = raw['http'][0]['matchers']
+        return fingerprints
+    except requests.RequestException as e:
+        print(f"Error fetching Nuclei fingerprints: {e}")
+        return None
+    except yaml.YAMLError as e:
+        print(f"Error parsing YAML data: {e}")
+        return None
+    except (KeyError, IndexError, TypeError) as e:
+        # The template parsed, but `http[0].matchers` is not where expected
+        # (e.g. an empty document or a restructured upstream template).
+        print(f"Unexpected Nuclei template layout: {e!r}")
+        return None
+
+def nuclei_check(response: requests.Response, fingerprints: list[dict[str,str|list[str]]]) -> bool:
+    """Check if the response matches any of the WAF fingerprints.
+
+    Keyword arguments:
+    response -- The HTTP response to check.
+    fingerprints -- The list of Nuclei WAF fingerprints to check against.
+
+    Every ``word`` and ``regex`` matcher is evaluated in order until one
+    matches; matchers of any other type are skipped.
+
+    Returns True if a WAF is detected, False otherwise.
+    """
+    for matcher in fingerprints:
+        matcher_type = matcher.get('type')
+        if matcher_type == 'word':
+            matched = _check_nuclei_words(matcher, response)
+        elif matcher_type == 'regex':
+            matched = _check_nuclei_regex(matcher, response)
+        else:
+            # No local checker for this matcher type.
+            continue
+        # Only a hit ends the scan; a miss must fall through to the next
+        # fingerprint, otherwise just the first matcher is ever consulted.
+        if matched:
+            return True
+    return False
@@ -0,0 +1,26 @@
+# Mock Nuclei template used as a fixture by the waf_check tests.
+# Holds one regex matcher and two word matchers (body and header).
+id: global-waf-detect
+http:
+  - global-matchers: true
+    matchers-condition: or
+    matchers:
+      # Regex matcher targeting the `response` part
+      - type: regex
+        name: regexSite
+        part: response
+        condition: or
+        regex:
+          - '(?i)access.to.this.page.has.been.denied'
+          - '(?i)http(s)?://(www.)?anotheroneblocked.\w+.whywasiblocked'
+
+      # Word matcher on the body, deliberately without an explicit condition
+      - type: word
+        name: wordSiteBody
+        part: body
+        words:
+          - 'bad_text_in_body'
+
+      # Word matcher on the headers with an explicit `or` condition
+      - type: word
+        name: wordSiteHead
+        part: header
+        condition: or
+        words:
+          - 'text_in_head'
+          - 'other_in_head'
@@ -0,0 +1,107 @@
+import os
+import unittest
+from unittest.mock import patch, Mock
+import requests
+from requests.structures import CaseInsensitiveDict
+import yaml
+
+from sherlock_project import waf_check
+
+
+# Path to the mock Nuclei template, resolved relative to this test module
+TEMPLATE_BODY_PATH: str = os.path.join(os.path.dirname(__file__), 'mocks', 'global_waf_detect.yaml')
+
+def side_effect(url, **kwargs) -> Mock:
+    if url == waf_check.NUCLEI_FINGERPRINT_URL:
+        with open(TEMPLATE_BODY_PATH, 'r', encoding='utf-8') as file:
+            template_body: str = file.read()
+        mock_response: Mock = Mock()
+        mock_response.status_code = 200
+        mock_response.text = template_body
+        return mock_response
+    raise RuntimeError("Unexpected URL")
+
+class TestWafCheck(unittest.TestCase):
+    """Unit tests for the Nuclei-based WAF fingerprint helpers in `waf_check`."""
+
+    @patch('sherlock_project.waf_check.requests.get')
+    def test_fetch_nuclei_fingerprints(self, mock_requests_get): # type: ignore
+        """The fetcher returns the matchers list parsed from the served template."""
+        # Serve the local mock template instead of performing a real HTTP request
+        mock_requests_get.side_effect = side_effect
+
+        result = waf_check.fetch_nuclei_fingerprints()
+
+        with open(TEMPLATE_BODY_PATH, 'r', encoding='utf-8') as file:
+            template_body: str = file.read()
+
+        # Expected value is parsed from the same mock file the fetcher was served
+        expected: list[dict[str, str | list[str]]] = yaml.safe_load(template_body)['http'][0]['matchers']
+        self.assertEqual(result, expected)
+
+    def test_nuclei_regex_check(self):
+        """Regex matchers honour `part` and `condition`.
+
+        The same `matcher` dict is mutated step by step, so the order of the
+        assertions below matters.
+        """
+        mock_res: requests.Response = requests.Response()
+        mock_res.status_code = 200
+        mock_res._content = b"This is a test response with Test-Regex in the body."
+        mock_res.headers = CaseInsensitiveDict({
+            'Content-Type': 'text/html',
+            'Server': 'TestServer'
+        })
+        matcher: dict[str, str | list[str]] = {
+            'type': 'regex',
+            'name': 'test-regex',
+            'part': 'body',
+            'regex': [r'(?i)not-present'],
+            'condition': 'or'
+        }
+        # Pattern absent from the body: no match expected
+        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # Case-insensitive pattern is expected to find 'Test-Regex' in the body
+        matcher['regex'] = [r'(?i)TeSt-REgEx']
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # `or`: one matching pattern out of two is expected to suffice
+        matcher['regex'] = [r'(?i)TeSt-REgEx', r'(?i)Not-Present']
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # `and`: the absent second pattern is expected to fail the matcher
+        matcher['condition'] = 'and'
+        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # Header part: the pattern is expected to match the Server header value
+        matcher['part'] = 'header'
+        matcher['regex'] = [r'(?i)testserver']
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # `response` part: the header-only pattern is still expected to match
+        matcher['part'] = 'response'
+        self.assertTrue(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # Pattern absent from both body and headers: no match expected
+        matcher['regex'] = [r'(?i)not-present']
+        self.assertFalse(waf_check._check_nuclei_regex(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+    def test_nuclei_words_check(self):
+        """Word matchers honour `part` and `condition` and are case-sensitive.
+
+        The same `matcher` dict is mutated step by step, so the order of the
+        assertions below matters.
+        """
+        mock_res: requests.Response = requests.Response()
+        mock_res.status_code = 200
+        mock_res._content = b"This is a test response with test-words in the body."
+        mock_res.headers = CaseInsensitiveDict({
+            'Content-Type': 'text/html',
+            'Server': 'TestServer'
+        })
+        matcher: dict[str, str | list[str]] = {
+            'type': 'word',
+            'name': 'test-word',
+            'part': 'body',
+            'words': ['not-present'],
+            'condition': 'or'
+        }
+        # Word absent from the body: no match expected
+        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # 'test-word' is a substring of 'test-words' in the body
+        matcher['words'] = ['test-word']
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # `or`: the first word is present, so the absent second word is irrelevant
+        matcher['words'] = ['test-word', 'Not-Present']
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # `and`: the absent second word fails the matcher
+        matcher['condition'] = 'and'
+        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # Header part, wrong case: lowercase 'testserver' must not match 'TestServer'
+        matcher['part'] = 'header'
+        matcher['words'] = ['testserver']
+        self.assertFalse(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # Header part, exact case: matches the Server header value
+        matcher['words'] = ['TestServer']
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
+
+        # `response` part also includes the headers, so the word still matches
+        matcher['part'] = 'response'
+        self.assertTrue(waf_check._check_nuclei_words(matcher, mock_res)) # pyright: ignore[reportPrivateUsage]
Author	SHA1	Message	Date
Paul Pfeister	d25848cc5f	chore: remote waf fingerprinting base	2025-10-04 23:54:29 -04:00
Paul Pfeister	f32f4ffaee	Merge pull request #2595 from obiwan04kanobi/feature/issue-2196-ci-docker-build-test Add Docker build test to CI workflow (#2196)	2025-10-04 21:09:04 -04:00
Paul Pfeister	7379ba7b19	Merge branch 'remove-tor'	2025-10-04 20:52:40 -04:00
Paul Pfeister	3aeb6d6356	Merge pull request #2602 from sherlock-project/feat/no-txt chore: make default --no-txt	2025-10-04 20:36:33 -04:00
Paul Pfeister	4246a7b16f	chore: make default --no-txt Workflows where a txt file is still required should use --txt	2025-10-04 20:32:16 -04:00
Paul Pfeister	e44fe49c8f	Merge pull request #2601 from sherlock-project/feat/graceful-skip feat: gracefully skip sites with invalid errorType	2025-10-04 20:23:07 -04:00
Paul Pfeister	52cd5fdfc1	feat: gracefully skip sites with invalid errorType	2025-10-04 20:22:34 -04:00
Paul Pfeister	947f1ad2b6	Merge pull request #2574 from dollaransh17/fix/http-request-timeouts Security Fix: Add timeout parameters to HTTP requests	2025-10-04 18:42:13 -04:00
Paul Pfeister	cfcc82aaca	Merge pull request #2597 from sherlock-project/feat/multiple-types Support multiple errorType checks	2025-10-04 17:21:26 -04:00
Paul Pfeister	0794e02b52	feat: support multiple errorTypes	2025-10-04 16:53:30 -04:00
Paul Pfeister	975965abed	Merge pull request #2589 from dollaransh17/fix/threads-false-positive fix(sites): Fix Threads false positive detection	2025-10-04 15:44:04 -04:00
Paul Pfeister	a678bed154	Merge pull request #2587 from akh7177/remediate-cyberdefenders-fp fix(sites): Remediate False Positives for CyberDefenders	2025-10-04 15:43:48 -04:00
Paul Pfeister	4ec6f1eec0	Merge pull request #2585 from akh7177/remediate-slideshare-fp fix(sites): Remediate False Positive for SlideShare	2025-10-04 15:43:36 -04:00
Paul Pfeister	d1527376e7	Merge pull request #2584 from akh7177/remediate-roblox-fp fix(sites): Remediate False Positive for Roblox	2025-10-04 15:43:29 -04:00
obiwan04kanobi	b99719ce60	Add Docker build test to CI workflow - Adds docker-build-test job to regression.yml - Runs on push/merge to master and release branches - Extracts VERSION_TAG from pyproject.toml for build - Tests that Docker image builds and runs successfully - Resolves dockerfile syntax warnings - Resolves #2196"	2025-10-05 00:22:12 +05:30
dollaransh17	dc869852bc	fix(sites): Fix Threads false positive detection Threads was showing false positives for non-existent users because the error message detection was incorrect. Updated errorMsg: - Old: "<title>Threads</title>" (generic, matches valid pages too) - New: "<title>Threads • Log in</title>" (specific to non-existent users) When a user doesn't exist, Threads redirects to a login page with the title "Threads • Log in". Valid user profiles have titles like "Username (@username) • Threads, Say more". Tested with: - Invalid user (impossibleuser12345): Correctly not found - Valid user (zuck): Correctly found This fixes the false positive issue where non-existent Threads profiles were being reported as found.	2025-10-04 17:22:50 +05:30
Abhyuday K Hegde	5cd769c2f4	Remediate False Positives for CyberDefenders	2025-10-04 15:12:20 +05:30
Abhyuday K Hegde	977ad5c1a4	Remediate False Positive for SlideShare	2025-10-04 14:48:37 +05:30
Abhyuday K Hegde	57a0ccef38	Remediate False Positive for Roblox	2025-10-04 14:30:40 +05:30
dollaransh17	0e7219b191	Security Fix: Add timeout parameters to HTTP requests This fix addresses a critical security vulnerability where HTTP requests could hang indefinitely, potentially causing denial of service. Changes: - Added 10-second timeout to version check API call - Added 10-second timeout to GitHub pull request API call - Added 30-second timeout to data file downloads (larger timeout for data) - Added 10-second timeout to exclusions list download Impact: - Prevents infinite hangs that could freeze the application - Improves user experience with predictable response times - Fixes security issue flagged by Bandit static analysis (B113) - Makes the application more robust in poor network conditions The timeouts are conservative enough to work with slow connections while preventing indefinite blocking that could be exploited.	2025-10-03 13:41:43 +05:30
Paul Pfeister	1d2c4b134f	Merge pull request #2570 from shreyasNaik0101/fix/remediate-applediscussions fix(sites): Remediate false positive for Apple Discussions	2025-10-02 20:30:57 -04:00
shreyasNaik0101	b245c462c9	fix(sites): Remediate false positive for Apple Discussions	2025-10-03 05:56:52 +05:30
Paul Pfeister	66d9733da7	Merge pull request #2565 from shreyasNaik0101/fix/remediate-mydramalist fix(sites): Remediate false positive for Mydramalist	2025-10-02 19:40:47 -04:00
Paul Pfeister	c55deab3a2	Merge pull request #2561 from shreyasNaik0101/fix/remediate-deviantart fix(sites): Remediate false positive for DeviantArt	2025-10-02 19:37:00 -04:00
Paul Pfeister	edcb697793	Merge pull request #2564 from shreyasNaik0101/fix/remediate-allmylinks fix(sites): Remediate false positive for AllMyLinks	2025-10-02 19:36:43 -04:00
shreyasNaik0101	d314d75db1	fix(sites): Remediate false positive for Mydramalist	2025-10-03 04:43:05 +05:30
shreyasNaik0101	c89a52caf7	fix(sites): Remediate false positive for AllMyLinks	2025-10-03 04:25:46 +05:30
Paul Pfeister	9c18cfe273	Merge pull request #2563 from sherlock-project/chore/update-co chore: update code owners	2025-10-02 18:25:59 -04:00
shreyasNaik0101	779d4c33f4	fix: Remove username_unclaimed as requested	2025-10-03 03:55:03 +05:30
Paul Pfeister	072c24687b	Merge pull request #2558 from hanjm-github/master feat: Add some popular website in Korea	2025-10-02 18:22:42 -04:00
Paul Pfeister	b811b2bd47	chore: update code owners	2025-10-02 18:21:20 -04:00
shreyasNaik0101	355bfbd328	fix(sites): Remediate false positive for DeviantArt	2025-10-03 00:42:07 +05:30
JongMyeong HAN	7b3632bdad	Add comment to site 'namuwiki' Co-authored-by: Paul Pfeister <code@pfeister.dev>	2025-10-03 04:00:41 +09:00
JongMyeong HAN	cd7c52e4fa	Feat: Add tistory	2025-10-01 00:44:55 +09:00
JongMyeong HAN	86140af50e	feat: Add SOOP	2025-10-01 00:44:02 +09:00
JongMyeong HAN	e5cd5e5bfe	feat: Add namuwiki	2025-10-01 00:43:21 +09:00
JongMyeong HAN	dc89f1cd27	feat: Add dcinside	2025-10-01 00:41:23 +09:00
Paul Pfeister	2016892e64	Remove torrequest dep Not sure why it's not in my patch file, but I was removing via sed in my spec instead.	2024-06-28 23:39:38 -04:00
Paul Pfeister	44ad8f506a	Lint	2024-06-28 23:38:44 -04:00
Siddharth Dushantha	cfa4097df9	removed support for tor	2024-06-26 21:57:11 +02:00