1 Commits

Author SHA1 Message Date
Paul Pfeister 707597202d fix: mastodon.cloud
Co-authored-by: dollaransh17 <dollaransh17@users.noreply.github.com>
2025-10-05 14:32:54 -04:00
16 changed files with 380 additions and 908 deletions
+1 -1
View File
@@ -65,7 +65,7 @@ The Actor provides three types of outputs:
| Field | Type | Required | Description |
|-------|------|----------|-------------|
| `username` | string | Yes | Username the search was conducted for |
| `links` | array | Yes | Array with found links to the social media |
| `links` | array | Yes | Array with found links to the social media |
| `links[]`| string | No | URL to the account |
### Example Dataset Item (JSON)
+1 -1
View File
@@ -1,5 +1,5 @@
### REPOSITORY
/.github/CODEOWNERS @sdushantha @ppfeister
/.github/CODEOWNERS @sdushantha
/.github/FUNDING.yml @sdushantha
/LICENSE @sdushantha
+6 -31
View File
@@ -11,7 +11,6 @@ on:
- '**/*.py'
- '**/*.ini'
- '**/*.toml'
- 'Dockerfile'
push:
branches:
- master
@@ -22,17 +21,15 @@ on:
- '**/*.py'
- '**/*.ini'
- '**/*.toml'
- 'Dockerfile'
jobs:
tox-lint:
# Linting is run through tox to ensure that the same linter is used by local runners
runs-on: ubuntu-latest
# Linting is run through tox to ensure that the same linter
# is used by local runners
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@v4
- name: Set up linting environment
uses: actions/setup-python@v6
uses: actions/setup-python@v5
with:
python-version: '3.x'
- name: Install tox and related dependencies
@@ -44,8 +41,7 @@ jobs:
tox-matrix:
runs-on: ${{ matrix.os }}
strategy:
# We want to know what specific versions it fails on
fail-fast: false
fail-fast: false # We want to know what specific versions it fails on
matrix:
os: [
ubuntu-latest,
@@ -57,13 +53,11 @@ jobs:
'3.11',
'3.12',
'3.13',
'3.14',
'3.14t',
]
steps:
- uses: actions/checkout@v6
- uses: actions/checkout@v4
- name: Set up environment ${{ matrix.python-version }}
uses: actions/setup-python@v6
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install tox and related dependencies
@@ -73,22 +67,3 @@ jobs:
pip install tox-gh-actions
- name: Run tox
run: tox
docker-build-test:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v6
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Get version from pyproject.toml
id: get-version
run: |
VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2)
echo "version=$VERSION" >> $GITHUB_OUTPUT
- name: Build Docker image
run: |
docker build \
--build-arg VERSION_TAG=${{ steps.get-version.outputs.version }} \
-t sherlock-test:latest .
- name: Test Docker image runs
run: docker run --rm sherlock-test:latest --version
+13 -40
View File
@@ -17,41 +17,29 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v5
with:
# Checkout the base branch but fetch all history to avoid a second fetch call
ref: ${{ github.base_ref }}
fetch-depth: 0
persist-credentials: false
fetch-depth: 1
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: "3.13"
python-version: '3.13'
- name: Install Poetry
uses: abatilo/actions-poetry@v4
with:
poetry-version: "latest"
poetry-version: 'latest'
- name: Install dependencies
run: |
poetry install --no-interaction --with dev
- name: Prepare JSON versions for comparison
- name: Drop in place updated manifest from base
run: |
# Fetch only the PR's branch head (single network call in this step)
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr
# Find the merge-base commit between the target branch and the PR branch
MERGE_BASE=$(git merge-base origin/${{ github.base_ref }} pr)
echo "Comparing PR head against merge-base commit: $MERGE_BASE"
# Safely extract the file from the PR's head and the merge-base commit
git show pr:sherlock_project/resources/data.json > data.json.head
git show $MERGE_BASE:sherlock_project/resources/data.json > data.json.base
# CRITICAL FIX: Overwrite the checked-out data.json with the one from the PR
# This ensures that pytest runs against the new, updated file.
cp data.json.head sherlock_project/resources/data.json
cp sherlock_project/resources/data.json data.json.base
git fetch origin pull/${{ github.event.pull_request.number }}/head:pr --depth=1
git show pr:sherlock_project/resources/data.json > sherlock_project/resources/data.json
cp sherlock_project/resources/data.json data.json.head
- name: Discover modified targets
id: discover-modified
@@ -59,16 +47,8 @@ jobs:
CHANGED=$(
python - <<'EOF'
import json
import sys
try:
with open("data.json.base") as f: base = json.load(f)
with open("data.json.head") as f: head = json.load(f)
except FileNotFoundError as e:
print(f"Error: Could not find {e.filename}", file=sys.stderr)
sys.exit(1)
except json.JSONDecodeError as e:
print(f"Error: Could not decode JSON from a file - {e}", file=sys.stderr)
sys.exit(1)
with open("data.json.base") as f: base = json.load(f)
with open("data.json.head") as f: head = json.load(f)
changed = []
for k, v in head.items():
@@ -83,19 +63,12 @@ jobs:
echo -e ">>> Changed targets: \n$(echo $CHANGED | tr ',' '\n')"
echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT"
- name: Validate remote manifest against local schema
if: steps.discover-modified.outputs.changed_targets != ''
run: |
poetry run pytest tests/test_manifest.py::test_validate_manifest_against_local_schema
# --- The rest of the steps below are unchanged ---
- name: Validate modified targets
env:
CHANGED_TARGETS: ${{ steps.discover-modified.outputs.changed_targets }}
if: steps.discover-modified.outputs.changed_targets != ''
continue-on-error: true
run: |
poetry run pytest -q --tb no -rA -m validate_targets -n 20 \
--chunked-sites "$CHANGED_TARGETS" \
--chunked-sites "${{ steps.discover-modified.outputs.changed_targets }}" \
--junitxml=validation_results.xml
- name: Prepare validation summary
+1 -1
View File
@@ -4,7 +4,7 @@
# 3. Build image with BOTH latest and version tags
# i.e. `docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .`
FROM python:3.12-slim-bullseye AS build
FROM python:3.12-slim-bullseye as build
WORKDIR /sherlock
RUN pip3 install --no-cache-dir --upgrade pip
+11 -17
View File
@@ -1,45 +1,39 @@
#!/usr/bin/env python
# This module generates the listing of supported sites which can be found in
# sites.mdx. It also organizes all the sites in alphanumeric order
# sites.mdx. It also organizes all the sites in alphanumeric order
import json
import os
DATA_REL_URI: str = "sherlock_project/resources/data.json"
DEFAULT_ENCODING = "utf-8"
# Read the data.json file
with open(DATA_REL_URI, "r", encoding=DEFAULT_ENCODING) as data_file:
with open(DATA_REL_URI, "r", encoding="utf-8") as data_file:
data: dict = json.load(data_file)
# Removes schema-specific keywords for proper processing
social_networks = data.copy()
social_networks: dict = dict(data)
social_networks.pop('$schema', None)
# Sort the social networks in alphanumeric order
social_networks = sorted(social_networks.items())
social_networks: list = sorted(social_networks.items())
# Make output dir where the site list will be written
os.mkdir("output")
# Write the list of supported sites to sites.mdx
with open("output/sites.mdx", "w", encoding=DEFAULT_ENCODING) as site_file:
site_file.write("---\n")
site_file.write("title: 'List of supported sites'\n")
site_file.write("sidebarTitle: 'Supported sites'\n")
site_file.write("icon: 'globe'\n")
site_file.write("description: 'Sherlock currently supports **400+** sites'\n")
site_file.write("---\n\n")
# Write the list of supported sites to sites.mdx
with open("output/sites.mdx", "w") as site_file:
site_file.write("---\ntitle: 'List of supported sites'\nsidebarTitle: 'Supported sites'\nicon: 'globe'\ndescription: 'Sherlock currently supports **400+** sites'\n---\n\n")
for social_network, info in social_networks:
url_main = info["urlMain"]
is_nsfw = "**(NSFW)**" if info.get("isNSFW") else ""
site_file.write(f"1. [{social_network}]({url_main}) {is_nsfw}\n")
# Overwrite the data.json file with sorted data
with open(DATA_REL_URI, "w", encoding=DEFAULT_ENCODING) as data_file:
with open(DATA_REL_URI, "w") as data_file:
sorted_data = json.dumps(data, indent=2, sort_keys=True)
data_file.write(sorted_data)
data_file.write("\n") # Keep the newline after writing data
data_file.write("\n")
print("Finished updating supported site listing!")
+47 -19
View File
@@ -23,17 +23,17 @@
> [!WARNING]
> Packages for ParrotOS and Ubuntu 24.04, maintained by a third party, appear to be __broken__.
> Users of these systems should defer to [`uv`](https://docs.astral.sh/uv/)/`pipx`/`pip` or Docker.
> Users of these systems should defer to pipx/pip or Docker.
| Method | Notes |
| - | - |
| `pipx install sherlock-project` | `pip` or [`uv`](https://docs.astral.sh/uv/) may be used in place of `pipx` |
| `pipx install sherlock-project` | `pip` may be used in place of `pipx` |
| `docker run -it --rm sherlock/sherlock` |
| `dnf install sherlock-project` | |
Community-maintained packages are available for Debian (>= 13), Ubuntu (>= 22.10), Homebrew, Kali, and BlackArch. These packages are not directly supported or maintained by the Sherlock Project.
See all alternative installation methods [here](https://sherlockproject.xyz/installation).
See all alternative installation methods [here](https://sherlockproject.xyz/installation).
## General usage
@@ -51,42 +51,70 @@ Accounts found will be stored in an individual text file with the corresponding
```console
$ sherlock --help
usage: sherlock [-h] [--version] [--verbose] [--folderoutput FOLDEROUTPUT] [--output OUTPUT] [--csv] [--xlsx] [--site SITE_NAME] [--proxy PROXY_URL] [--dump-response]
[--json JSON_FILE] [--timeout TIMEOUT] [--print-all] [--print-found] [--no-color] [--browse] [--local] [--nsfw] [--txt] [--ignore-exclusions]
usage: sherlock [-h] [--version] [--verbose] [--folderoutput FOLDEROUTPUT]
[--output OUTPUT] [--tor] [--unique-tor] [--csv] [--xlsx]
[--site SITE_NAME] [--proxy PROXY_URL] [--json JSON_FILE]
[--timeout TIMEOUT] [--print-all] [--print-found] [--no-color]
[--browse] [--local] [--nsfw]
USERNAMES [USERNAMES ...]
Sherlock: Find Usernames Across Social Networks (Version 0.16.0)
Sherlock: Find Usernames Across Social Networks (Version 0.14.3)
positional arguments:
USERNAMES One or more usernames to check with social networks. Check similar usernames using {?} (replace to '_', '-', '.').
USERNAMES One or more usernames to check with social networks.
Check similar usernames using {?} (replace to '_', '-', '.').
options:
optional arguments:
-h, --help show this help message and exit
--version Display version information and dependencies.
--verbose, -v, -d, --debug
Display extra debugging information and metrics.
--folderoutput FOLDEROUTPUT, -fo FOLDEROUTPUT
If using multiple usernames, the output of the results will be saved to this folder.
If using multiple usernames, the output of the results will be
saved to this folder.
--output OUTPUT, -o OUTPUT
If using single username, the output of the result will be saved to this file.
If using single username, the output of the result will be saved
to this file.
--tor, -t Make requests over Tor; increases runtime; requires Tor to be
installed and in system path.
--unique-tor, -u Make requests over Tor with new Tor circuit after each request;
increases runtime; requires Tor to be installed and in system
path.
--csv Create Comma-Separated Values (CSV) File.
--xlsx Create the standard file for the modern Microsoft Excel spreadsheet (xlsx).
--site SITE_NAME Limit analysis to just the listed sites. Add multiple options to specify more than one site.
--xlsx Create the standard file for the modern Microsoft Excel
spreadsheet (xlsx).
--site SITE_NAME Limit analysis to just the listed sites. Add multiple options to
specify more than one site.
--proxy PROXY_URL, -p PROXY_URL
Make requests over a proxy. e.g. socks5://127.0.0.1:1080
--dump-response Dump the HTTP response to stdout for targeted debugging.
--json JSON_FILE, -j JSON_FILE
Load data from a JSON file or an online, valid, JSON file. Upstream PR numbers also accepted.
Load data from a JSON file or an online, valid, JSON file.
--timeout TIMEOUT Time (in seconds) to wait for response to requests (Default: 60)
--print-all Output sites where the username was not found.
--print-found Output sites where the username was found (also if exported as file).
--print-found Output sites where the username was found.
--no-color Don't color terminal output
--browse, -b Browse to all results on default browser.
--local, -l Force the use of the local data.json file.
--nsfw Include checking of NSFW sites from default list.
--txt Enable creation of a txt file
--ignore-exclusions Ignore upstream exclusions (may return more false positives)
```
## Apify Actor Usage [![Sherlock Actor](https://apify.com/actor-badge?actor=netmilk/sherlock)](https://apify.com/netmilk/sherlock?fpr=sherlock)
<a href="https://apify.com/netmilk/sherlock?fpr=sherlock"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Sherlock Actor on Apify" width="176" height="39" /></a>
You can run Sherlock in the cloud without installation using the [Sherlock Actor](https://apify.com/netmilk/sherlock?fpr=sherlock) on [Apify](https://apify.com?fpr=sherlock) free of charge.
``` bash
$ echo '{"usernames":["user123"]}' | apify call -so netmilk/sherlock
[{
"username": "user123",
"links": [
"https://www.1337x.to/user/user123/",
...
]
}]
```
Read more about the [Sherlock Actor](../.actor/README.md), including how to use it programmatically via the Apify [API](https://apify.com/netmilk/sherlock/api?fpr=sherlock), [CLI](https://docs.apify.com/cli/?fpr=sherlock) and [JS/TS and Python SDKs](https://docs.apify.com/sdk?fpr=sherlock).
## Credits
@@ -96,7 +124,7 @@ Thank you to everyone who has contributed to Sherlock! ❤️
<img src="https://contrib.rocks/image?&columns=25&max=10000&&repo=sherlock-project/sherlock" alt="contributors"/>
</a>
## Star History
## Star history
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=sherlock-project/sherlock&type=Date&theme=dark" />
@@ -107,7 +135,7 @@ Thank you to everyone who has contributed to Sherlock! ❤️
## License
MIT © Sherlock Project<br/>
Creator - [Siddharth Dushantha](https://github.com/sdushantha)
Original Creator - [Siddharth Dushantha](https://github.com/sdushantha)
<!-- Reference Links -->
+5 -5
View File
@@ -8,7 +8,7 @@ source = "init"
[tool.poetry]
name = "sherlock-project"
version = "0.16.1"
version = "0.16.0"
description = "Hunt down social media accounts by username across social networks"
license = "MIT"
authors = [
@@ -29,10 +29,6 @@ classifiers = [
"Natural Language :: English",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Security"
]
homepage = "https://sherlockproject.xyz/"
@@ -50,10 +46,14 @@ PySocks = "^1.7.0"
requests = "^2.22.0"
requests-futures = "^1.0.0"
stem = "^1.8.0"
torrequest = "^0.1.0"
pandas = "^2.2.1"
openpyxl = "^3.0.10"
tomli = "^2.2.1"
[tool.poetry.extras]
tor = ["torrequest"]
[tool.poetry.group.dev.dependencies]
jsonschema = "^4.0.0"
rstr = "^3.2.2"
+9 -2
View File
@@ -37,6 +37,7 @@ class QueryNotify:
self.result = result
# return
def start(self, message=None):
"""Notify Start.
@@ -55,6 +56,7 @@ class QueryNotify:
Nothing.
"""
# return
def update(self, result):
"""Notify Update.
@@ -73,6 +75,7 @@ class QueryNotify:
self.result = result
# return
def finish(self, message=None):
"""Notify Finish.
@@ -91,6 +94,7 @@ class QueryNotify:
Nothing.
"""
# return
def __str__(self):
"""Convert Object To String.
@@ -133,6 +137,7 @@ class QueryNotifyPrint(QueryNotify):
self.print_all = print_all
self.browse = browse
return
def start(self, message):
"""Notify Start.
@@ -158,6 +163,7 @@ class QueryNotifyPrint(QueryNotify):
# An empty line between first line and the result(more clear output)
print('\r')
return
def countResults(self):
"""This function counts the number of results. Every time the function is called,
@@ -232,7 +238,7 @@ class QueryNotifyPrint(QueryNotify):
Fore.WHITE + "]" +
Fore.GREEN + f" {self.result.site_name}:" +
Fore.YELLOW + f" {msg}")
elif result.status == QueryStatus.WAF:
if self.print_all:
print(Style.BRIGHT + Fore.WHITE + "[" +
@@ -248,9 +254,10 @@ class QueryNotifyPrint(QueryNotify):
f"Unknown Query Status '{result.status}' for site '{self.result.site_name}'"
)
return
def finish(self, message="The processing has been finished."):
"""Notify Finish.
"""Notify Start.
Will print the last line to the standard output.
Keyword Arguments:
self -- This object.
File diff suppressed because it is too large Load Diff
+74 -143
View File
@@ -1,149 +1,80 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "Sherlock Target Manifest",
"description": "Social media targets to probe for the existence of known usernames",
"type": "object",
"properties": {
"$schema": { "type": "string" }
},
"patternProperties": {
"^(?!\\$).*?$": {
"type": "object",
"description": "Target name and associated information (key should be human readable name)",
"required": ["url", "urlMain", "errorType", "username_claimed"],
"properties": {
"url": { "type": "string" },
"urlMain": { "type": "string" },
"urlProbe": { "type": "string" },
"username_claimed": { "type": "string" },
"regexCheck": { "type": "string" },
"isNSFW": { "type": "boolean" },
"headers": { "type": "object" },
"request_payload": { "type": "object" },
"__comment__": {
"type": "string",
"description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
},
"tags": {
"oneOf": [
{ "$ref": "#/$defs/tag" },
{ "type": "array", "items": { "$ref": "#/$defs/tag" } }
]
},
"request_method": {
"type": "string",
"enum": ["GET", "POST", "HEAD", "PUT"]
},
"errorType": {
"oneOf": [
{
"type": "string",
"enum": ["message", "response_url", "status_code"]
"$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "Sherlock Target Manifest",
"description": "Social media targets to probe for the existence of known usernames",
"type": "object",
"properties": {
"$schema": { "type": "string" }
},
"patternProperties": {
"^(?!\\$).*?$": {
"type": "object",
"description": "Target name and associated information (key should be human readable name)",
"required": [ "url", "urlMain", "errorType", "username_claimed" ],
"properties": {
"url": { "type": "string" },
"urlMain": { "type": "string" },
"urlProbe": { "type": "string" },
"username_claimed": { "type": "string" },
"regexCheck": { "type": "string" },
"isNSFW": { "type": "boolean" },
"headers": { "type": "object" },
"request_payload": { "type": "object" },
"__comment__": {
"type": "string",
"description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock."
},
"tags": {
"oneOf": [
{ "$ref": "#/$defs/tag" },
{ "type": "array", "items": { "$ref": "#/$defs/tag" } }
]
},
"request_method": {
"type": "string",
"enum": [ "GET", "POST", "HEAD", "PUT" ]
},
"errorType": {
"type": "string",
"enum": [ "message", "response_url", "status_code" ]
},
"errorMsg": {
"oneOf": [
{ "type": "string" },
{ "type": "array", "items": { "type": "string" } }
]
},
"errorCode": {
"oneOf": [
{ "type": "integer" },
{ "type": "array", "items": { "type": "integer" } }
]
},
"errorUrl": { "type": "string" },
"response_url": { "type": "string" }
},
{
"type": "array",
"items": {
"type": "string",
"enum": ["message", "response_url", "status_code"]
}
}
]
},
"errorMsg": {
"oneOf": [
{ "type": "string" },
{ "type": "array", "items": { "type": "string" } }
]
},
"errorCode": {
"oneOf": [
{ "type": "integer" },
{ "type": "array", "items": { "type": "integer" } }
]
},
"errorUrl": { "type": "string" },
"response_url": { "type": "string" }
},
"dependencies": {
"errorMsg": {
"oneOf": [
{ "properties": { "errorType": { "const": "message" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "message" }
"dependencies": {
"errorMsg": {
"properties" : { "errorType": { "const": "message" } }
},
"errorUrl": {
"properties": { "errorType": { "const": "response_url" } }
},
"errorCode": {
"properties": { "errorType": { "const": "status_code" } }
}
}
}
]
},
"errorUrl": {
"oneOf": [
{ "properties": { "errorType": { "const": "response_url" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "response_url" }
}
}
}
]
},
"errorCode": {
"oneOf": [
{ "properties": { "errorType": { "const": "status_code" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "status_code" }
}
}
}
]
},
"if": { "properties": { "errorType": { "const": "message" } } },
"then": { "required": [ "errorMsg" ] },
"else": {
"if": { "properties": { "errorType": { "const": "response_url" } } },
"then": { "required": [ "errorUrl" ] }
},
"additionalProperties": false
}
},
"allOf": [
{
"if": {
"anyOf": [
{ "properties": { "errorType": { "const": "message" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "message" }
}
}
}
]
},
"then": { "required": ["errorMsg"] }
},
{
"if": {
"anyOf": [
{ "properties": { "errorType": { "const": "response_url" } } },
{
"properties": {
"errorType": {
"type": "array",
"contains": { "const": "response_url" }
}
}
}
]
},
"then": { "required": ["errorUrl"] }
}
],
"additionalProperties": false
},
"additionalProperties": false,
"$defs": {
"tag": { "type": "string", "enum": [ "adult", "gaming" ] }
}
},
"additionalProperties": false,
"$defs": {
"tag": { "type": "string", "enum": ["adult", "gaming"] }
}
}
+121 -66
View File
@@ -136,9 +136,6 @@ def get_response(request_future, error_type, social_network):
except requests.exceptions.RequestException as err:
error_context = "Unknown Error"
exception_text = str(err)
except UnicodeError as err:
error_context = "Encoding Error"
exception_text = str(err)
return response, error_context, exception_text
@@ -174,6 +171,8 @@ def sherlock(
username: str,
site_data: dict[str, dict[str, str]],
query_notify: QueryNotify,
tor: bool = False,
unique_tor: bool = False,
dump_response: bool = False,
proxy: Optional[str] = None,
timeout: int = 60,
@@ -189,6 +188,8 @@ def sherlock(
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
tor -- Boolean indicating whether to use a tor circuit for the requests.
unique_tor -- Boolean indicating whether to use a new tor circuit for each request.
proxy -- String indicating the proxy URL
timeout -- Time in seconds to wait before timing out request.
Default is 60 seconds.
@@ -209,9 +210,32 @@ def sherlock(
# Notify caller that we are starting the query.
query_notify.start(username)
# Create session based on request methodology
if tor or unique_tor:
try:
from torrequest import TorRequest # noqa: E402
except ImportError:
print("Important!")
print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
print("> If you've installed Sherlock via pip, you can include the optional dependency via `pip install 'sherlock-project[tor]'`.")
print("> Other packages should refer to their documentation, or install it separately with `pip install torrequest`.\n")
sys.exit(query_notify.finish())
# Normal requests
underlying_session = requests.session()
print("Important!")
print("> --tor and --unique-tor are now DEPRECATED, and may be removed in a future release of Sherlock.")
# Requests using Tor obfuscation
try:
underlying_request = TorRequest()
except OSError:
print("Tor not found in system path. Unable to continue.\n")
sys.exit(query_notify.finish())
underlying_session = underlying_request.session
else:
# Normal requests
underlying_session = requests.session()
underlying_request = requests.Request()
# Limit number of workers to 20.
# This is probably vastly overkill.
@@ -335,10 +359,15 @@ def sherlock(
# Store future in data for access later
net_info["request_future"] = future
# Reset identify for tor (if needed)
if unique_tor:
underlying_request.reset_identity()
# Add this site's results into final dictionary with all the other results.
results_total[social_network] = results_site
# Open the file containing account links
# Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses
for social_network, net_info in site_data.items():
# Retrieve results again
results_site = results_total.get(social_network)
@@ -352,8 +381,6 @@ def sherlock(
# Get the expected error type
error_type = net_info["errorType"]
if isinstance(error_type, str):
error_type: list[str] = [error_type]
# Retrieve future and ensure it has finished
future = net_info["request_future"]
@@ -398,60 +425,58 @@ def sherlock(
elif any(hitMsg in r.text for hitMsg in WAFHitMsgs):
query_status = QueryStatus.WAF
else:
if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type):
error_context = f"Unknown error type '{error_type}' for {social_network}"
query_status = QueryStatus.UNKNOWN
elif error_type == "message":
# error_flag True denotes no error found in the HTML
# error_flag False denotes error found in the HTML
error_flag = True
errors = net_info.get("errorMsg")
# errors will hold the error message
# it can be string or list
# by isinstance method we can detect that
# and handle the case for strings as normal procedure
# and if its list we can iterate the errors
if isinstance(errors, str):
# Checks if the error message is in the HTML
# if error is present we will set flag to False
if errors in r.text:
error_flag = False
else:
if "message" in error_type:
# error_flag True denotes no error found in the HTML
# error_flag False denotes error found in the HTML
error_flag = True
errors = net_info.get("errorMsg")
# errors will hold the error message
# it can be string or list
# by isinstance method we can detect that
# and handle the case for strings as normal procedure
# and if its list we can iterate the errors
if isinstance(errors, str):
# Checks if the error message is in the HTML
# if error is present we will set flag to False
if errors in r.text:
error_flag = False
else:
# If it's list, it will iterate all the error message
for error in errors:
if error in r.text:
error_flag = False
break
if error_flag:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
# If it's list, it will iterate all the error message
for error in errors:
if error in r.text:
error_flag = False
break
if error_flag:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
elif error_type == "status_code":
error_codes = net_info.get("errorCode")
query_status = QueryStatus.CLAIMED
if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE:
error_codes = net_info.get("errorCode")
query_status = QueryStatus.CLAIMED
# Type consistency, allowing for both singlets and lists in manifest
if isinstance(error_codes, int):
error_codes = [error_codes]
# Type consistency, allowing for both singlets and lists in manifest
if isinstance(error_codes, int):
error_codes = [error_codes]
if error_codes is not None and r.status_code in error_codes:
query_status = QueryStatus.AVAILABLE
elif r.status_code >= 300 or r.status_code < 200:
query_status = QueryStatus.AVAILABLE
if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE:
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= r.status_code < 300:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
if error_codes is not None and r.status_code in error_codes:
query_status = QueryStatus.AVAILABLE
elif r.status_code >= 300 or r.status_code < 200:
query_status = QueryStatus.AVAILABLE
elif error_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= r.status_code < 300:
query_status = QueryStatus.CLAIMED
else:
query_status = QueryStatus.AVAILABLE
else:
# It should be impossible to ever get here...
raise ValueError(
f"Unknown Error Type '{error_type}' for " f"site '{social_network}'"
)
if dump_response:
print("+++++++++++++++++++++")
@@ -571,6 +596,22 @@ def main():
dest="output",
help="If using single username, the output of the result will be saved to this file.",
)
parser.add_argument(
"--tor",
"-t",
action="store_true",
dest="tor",
default=False,
help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.",
)
parser.add_argument(
"--unique-tor",
"-u",
action="store_true",
dest="unique_tor",
default=False,
help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.",
)
parser.add_argument(
"--csv",
action="store_true",
@@ -679,11 +720,11 @@ def main():
)
parser.add_argument(
"--txt",
"--no-txt",
action="store_true",
dest="output_txt",
dest="no_txt",
default=False,
help="Enable creation of a txt file",
help="Disable creation of a txt file",
)
parser.add_argument(
@@ -701,7 +742,7 @@ def main():
# Check for newer version of Sherlock. If it exists, let the user know about it
try:
latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text
latest_release_raw = requests.get(forge_api_latest_release).text
latest_release_json = json_loads(latest_release_raw)
latest_remote_tag = latest_release_json["tag_name"]
@@ -714,10 +755,22 @@ def main():
except Exception as error:
print(f"A problem occurred while checking for an update: {error}")
# Argument check
# TODO regex check on args.proxy
if args.tor and (args.proxy is not None):
raise Exception("Tor and Proxy cannot be set at the same time.")
# Make prompts
if args.proxy is not None:
print("Using the proxy: " + args.proxy)
if args.tor or args.unique_tor:
print("Using Tor to make requests")
print(
"Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors."
)
if args.no_color:
# Disable color output.
init(strip=True, convert=False)
@@ -749,7 +802,7 @@ def main():
if args.json_file.isnumeric():
pull_number = args.json_file
pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}"
pull_request_raw = requests.get(pull_url, timeout=10).text
pull_request_raw = requests.get(pull_url).text
pull_request_json = json_loads(pull_request_raw)
# Check if it's a valid pull request
@@ -818,6 +871,8 @@ def main():
username,
site_data,
query_notify,
tor=args.tor,
unique_tor=args.unique_tor,
dump_response=args.dump_response,
proxy=args.proxy,
timeout=args.timeout,
@@ -833,7 +888,7 @@ def main():
else:
result_file = f"{username}.txt"
if args.output_txt:
if not args.no_txt:
with open(result_file, "w", encoding="utf-8") as file:
exists_counter = 0
for website_name in results:
@@ -918,8 +973,8 @@ def main():
{
"username": usernames,
"name": names,
"url_main": [f'=HYPERLINK(\"{u}\")' for u in url_main],
"url_user": [f'=HYPERLINK(\"{u}\")' for u in url_user],
"url_main": url_main,
"url_user": url_user,
"exists": exists,
"http_status": http_status,
"response_time_s": response_time_s,
+8 -3
View File
@@ -8,7 +8,7 @@ import requests
import secrets
MANIFEST_URL = "https://data.sherlockproject.xyz"
MANIFEST_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/master/sherlock_project/resources/data.json"
EXCLUSIONS_URL = "https://raw.githubusercontent.com/sherlock-project/sherlock/refs/heads/exclusions/false_positive_exclusions.txt"
class SiteInformation:
@@ -121,10 +121,15 @@ class SitesInformation:
# users from creating issue about false positives which has already been fixed or having outdated data
data_file_path = MANIFEST_URL
# Ensure that specified data file has correct extension.
if not data_file_path.lower().endswith(".json"):
raise FileNotFoundError(f"Incorrect JSON file extension for data file '{data_file_path}'.")
# if "http://" == data_file_path[:7].lower() or "https://" == data_file_path[:8].lower():
if data_file_path.lower().startswith("http"):
# Reference is to a URL.
try:
response = requests.get(url=data_file_path, timeout=30)
response = requests.get(url=data_file_path)
except Exception as error:
raise FileNotFoundError(
f"Problem while attempting to access data file URL '{data_file_path}': {error}"
@@ -161,7 +166,7 @@ class SitesInformation:
if honor_exclusions:
try:
response = requests.get(url=EXCLUSIONS_URL, timeout=10)
response = requests.get(url=EXCLUSIONS_URL)
if response.status_code == 200:
exclusions = response.text.splitlines()
exclusions = [exclusion.strip() for exclusion in exclusions]
-47
View File
@@ -1,47 +0,0 @@
"""Tests for handling usernames with special/unicode characters."""
from concurrent.futures import Future
from sherlock_project.sherlock import get_response
def _make_future_with_exception(exc):
"""Create a Future that raises the given exception."""
future = Future()
future.set_exception(exc)
return future
def test_get_response_handles_unicode_decode_error():
    """Regression test for issue #2730.

    Usernames with special characters (e.g. 'Émile') can trigger a
    UnicodeDecodeError inside the requests library during redirect
    handling. This must not crash the program: get_response is expected
    to hand back no response together with an "Encoding Error" context.
    """
    decode_error = UnicodeDecodeError(
        "utf-8", b"\xe9", 0, 1, "invalid continuation byte"
    )

    outcome = get_response(
        request_future=_make_future_with_exception(decode_error),
        error_type=["status_code"],
        social_network="TestSite",
    )
    response, error_context, exception_text = outcome

    assert response is None
    assert error_context == "Encoding Error"
    # The exception text is expected to mention the codec that failed.
    assert "utf-8" in exception_text
def test_get_response_handles_unicode_encode_error():
    """UnicodeEncodeError should also be caught (subclass of UnicodeError)."""
    encode_error = UnicodeEncodeError(
        "ascii", "É", 0, 1, "ordinal not in range(128)"
    )

    outcome = get_response(
        request_future=_make_future_with_exception(encode_error),
        error_type=["status_code"],
        social_network="TestSite",
    )
    response, error_context, exception_text = outcome

    assert response is None
    assert error_context == "Encoding Error"
    # The exception text is expected to mention the codec that failed.
    assert "ascii" in exception_text
+3 -3
View File
@@ -4,7 +4,7 @@ from sherlock_interactives import Interactives
from sherlock_interactives import InteractivesSubprocessError
def test_remove_nsfw(sites_obj):
nsfw_target: str = 'Xvideos'
nsfw_target: str = 'Pornhub'
assert nsfw_target in {site.name: site.information for site in sites_obj}
sites_obj.remove_nsfw_sites()
assert nsfw_target not in {site.name: site.information for site in sites_obj}
@@ -12,8 +12,8 @@ def test_remove_nsfw(sites_obj):
# Parametrized sites should *not* include Motherless, which is acting as the control
@pytest.mark.parametrize('nsfwsites', [
['Xvideos'],
['Xvideos', 'Erome'],
['Pornhub'],
['Pornhub', 'Xvideos'],
])
def test_nsfw_explicit_selection(sites_obj, nsfwsites):
for site in nsfwsites:
-1
View File
@@ -16,7 +16,6 @@ def set_pattern_upper_bound(pattern: str, upper_bound: int = FALSE_POSITIVE_QUAN
"""Set upper bound for regex patterns that use quantifiers such as `+` `*` or `{n,}`."""
def replace_upper_bound(match: re.Match) -> str: # type: ignore
lower_bound: int = int(match.group(1)) if match.group(1) else 0 # type: ignore
nonlocal upper_bound
upper_bound = upper_bound if lower_bound < upper_bound else lower_bound # type: ignore # noqa: F823
return f'{{{lower_bound},{upper_bound}}}'