From cfa4097df90407ad763365755ba1ae8e1bdb9be0 Mon Sep 17 00:00:00 2001 From: Siddharth Dushantha Date: Wed, 26 Jun 2024 21:57:11 +0200 Subject: [PATCH 01/33] removed support for tor --- sherlock/sherlock.py | 58 +++----------------------------------------- 1 file changed, 4 insertions(+), 54 deletions(-) diff --git a/sherlock/sherlock.py b/sherlock/sherlock.py index db8e9c2c..170ea32a 100644 --- a/sherlock/sherlock.py +++ b/sherlock/sherlock.py @@ -30,7 +30,6 @@ from .__init__ import ( # noqa: E402 ) from requests_futures.sessions import FuturesSession # noqa: E402 -from torrequest import TorRequest # noqa: E402 from sherlock.result import QueryStatus # noqa: E402 from sherlock.result import QueryResult # noqa: E402 from sherlock.notify import QueryNotify # noqa: E402 @@ -166,8 +165,6 @@ def sherlock( username, site_data, query_notify: QueryNotify, - tor: bool = False, - unique_tor: bool = False, proxy=None, timeout=60, ): @@ -182,8 +179,6 @@ def sherlock( query_notify -- Object with base type of QueryNotify(). This will be used to notify the caller about query results. - tor -- Boolean indicating whether to use a tor circuit for the requests. - unique_tor -- Boolean indicating whether to use a new tor circuit for each request. proxy -- String indicating the proxy URL timeout -- Time in seconds to wait before timing out request. Default is 60 seconds. @@ -204,20 +199,10 @@ def sherlock( # Notify caller that we are starting the query. query_notify.start(username) - # Create session based on request methodology - if tor or unique_tor: - # Requests using Tor obfuscation - try: - underlying_request = TorRequest() - except OSError: - print("Tor not found in system path. 
Unable to continue.\n") - sys.exit(query_notify.finish()) - underlying_session = underlying_request.session - else: - # Normal requests - underlying_session = requests.session() - underlying_request = requests.Request() + # Normal requests + underlying_session = requests.session() + underlying_request = requests.Request() # Limit number of workers to 20. # This is probably vastly overkill. @@ -341,15 +326,10 @@ def sherlock( # Store future in data for access later net_info["request_future"] = future - # Reset identify for tor (if needed) - if unique_tor: - underlying_request.reset_identity() - # Add this site's results into final dictionary with all the other results. results_total[social_network] = results_site # Open the file containing account links - # Core logic: If tor requests, make them here. If multi-threaded requests, wait for responses for social_network, net_info in site_data.items(): # Retrieve results again results_site = results_total.get(social_network) @@ -547,23 +527,7 @@ def main(): "-o", dest="output", help="If using single username, the output of the result will be saved to this file.", - ) - parser.add_argument( - "--tor", - "-t", - action="store_true", - dest="tor", - default=False, - help="Make requests over Tor; increases runtime; requires Tor to be installed and in system path.", - ) - parser.add_argument( - "--unique-tor", - "-u", - action="store_true", - dest="unique_tor", - default=False, - help="Make requests over Tor with new Tor circuit after each request; increases runtime; requires Tor to be installed and in system path.", - ) + ), parser.add_argument( "--csv", action="store_true", @@ -687,22 +651,10 @@ def main(): except Exception as error: print(f"A problem occurred while checking for an update: {error}") - # Argument check - # TODO regex check on args.proxy - if args.tor and (args.proxy is not None): - raise Exception("Tor and Proxy cannot be set at the same time.") - # Make prompts if args.proxy is not None: print("Using the 
proxy: " + args.proxy) - if args.tor or args.unique_tor: - print("Using Tor to make requests") - - print( - "Warning: some websites might refuse connecting over Tor, so note that using this option might increase connection errors." - ) - if args.no_color: # Disable color output. init(strip=True, convert=False) @@ -781,8 +733,6 @@ def main(): username, site_data, query_notify, - tor=args.tor, - unique_tor=args.unique_tor, proxy=args.proxy, timeout=args.timeout, ) From 44ad8f506a3719e25bfd894a90e2952199d405f6 Mon Sep 17 00:00:00 2001 From: Paul Pfeister Date: Fri, 28 Jun 2024 23:38:44 -0400 Subject: [PATCH 02/33] Lint --- sherlock/sherlock.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sherlock/sherlock.py b/sherlock/sherlock.py index 170ea32a..30346bd5 100644 --- a/sherlock/sherlock.py +++ b/sherlock/sherlock.py @@ -202,7 +202,6 @@ def sherlock( # Normal requests underlying_session = requests.session() - underlying_request = requests.Request() # Limit number of workers to 20. # This is probably vastly overkill. @@ -527,7 +526,7 @@ def main(): "-o", dest="output", help="If using single username, the output of the result will be saved to this file.", - ), + ) parser.add_argument( "--csv", action="store_true", From 2016892e648c5bc5674d2b3b910cd9f36988bcff Mon Sep 17 00:00:00 2001 From: Paul Pfeister Date: Fri, 28 Jun 2024 23:39:38 -0400 Subject: [PATCH 03/33] Remove torrequest dep Not sure why it's not in my patch file, but I was removing via sed in my spec instead. 
--- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fafa9f85..5674f016 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,6 @@ PySocks = "^1.7.0" requests = "^2.22.0" requests-futures = "^1.0.0" stem = "^1.8.0" -torrequest = "^0.1.0" # pandas can likely be bumped up to ^2.0.0 after fc39 EOL pandas = ">=1.0.0,<3.0.0" openpyxl = "^3.0.10" From dc89f1cd27a358a6771877cc0b597b3db822c06c Mon Sep 17 00:00:00 2001 From: JongMyeong HAN Date: Wed, 1 Oct 2025 00:41:23 +0900 Subject: [PATCH 04/33] feat: Add dcinside --- sherlock_project/resources/data.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 4c84ac52..ff8af075 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -600,6 +600,12 @@ "urlMain": "https://www.dailymotion.com/", "username_claimed": "blue" }, + "dcinside": { + "errorType": "status_code", + "url": "https://gallog.dcinside.com/{}", + "urlMain": "https://www.dcinside.com/", + "username_claimed": "anrbrb" + }, "Dealabs": { "errorMsg": "La page que vous essayez", "errorType": "message", From e5cd5e5bfe7df4ebd93e220a69496a4fdfe7b39f Mon Sep 17 00:00:00 2001 From: JongMyeong HAN Date: Wed, 1 Oct 2025 00:43:21 +0900 Subject: [PATCH 05/33] feat: Add namuwiki --- sherlock_project/resources/data.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index ff8af075..2b5dbf6b 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -1465,6 +1465,12 @@ "urlMain": "https://www.native-instruments.com/forum/", "username_claimed": "jambert" }, + "namuwiki": { + "errorType": "status_code", + "url": "https://namu.wiki/w/%EC%82%AC%EC%9A%A9%EC%9E%90:{}", + "urlMain": "https://namu.wiki/", + "username_claimed": "namu" + }, "NationStates Nation": { "errorMsg": "Was 
this your nation? It may have ceased to exist due to inactivity, but can rise again!", "errorType": "message", From 86140af50e6a2aae642ff38b1cab365a980fa283 Mon Sep 17 00:00:00 2001 From: JongMyeong HAN Date: Wed, 1 Oct 2025 00:44:02 +0900 Subject: [PATCH 06/33] feat: Add SOOP --- sherlock_project/resources/data.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 2b5dbf6b..eaf3e670 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -1964,6 +1964,13 @@ "urlMain": "https://www.snapchat.com", "username_claimed": "teamsnapchat" }, + "SOOP": { + "errorType": "status_code", + "url": "https://www.sooplive.co.kr/station/{}", + "urlMain": "https://www.sooplive.co.kr/", + "urlProbe": "https://api-channel.sooplive.co.kr/v1.1/channel/{}/station", + "username_claimed": "udkn" + }, "SoundCloud": { "errorType": "status_code", "url": "https://soundcloud.com/{}", From cd7c52e4fae2dc81bc3fd75d098498e430d8bec9 Mon Sep 17 00:00:00 2001 From: JongMyeong HAN Date: Wed, 1 Oct 2025 00:44:55 +0900 Subject: [PATCH 07/33] Feat: Add tistory --- sherlock_project/resources/data.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index eaf3e670..c4efcbe3 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -2138,6 +2138,12 @@ "urlMain": "https://themeforest.net/", "username_claimed": "user" }, + "tistory": { + "errorType": "status_code", + "url": "https://{}.tistory.com/", + "urlMain": "https://www.tistory.com/", + "username_claimed": "notice" + }, "TnAFlix": { "errorType": "status_code", "isNSFW": true, From 4fe41f09ff9868ee77080a6640da93fb89f2fae9 Mon Sep 17 00:00:00 2001 From: Ethan Zhang Date: Thu, 2 Oct 2025 12:42:47 +1000 Subject: [PATCH 08/33] Removed duplicate Bluesky entry in data.json --- sherlock_project/resources/data.json 
| 7 ------- 1 file changed, 7 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 4c84ac52..74ac5698 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -2840,13 +2840,6 @@ "urlMain": "https://znanylekarz.pl", "username_claimed": "janusz-nowak" }, - "Bluesky": { - "errorType": "status_code", - "url": "https://bsky.app/profile/{}.bsky.social", - "urlProbe": "https://public.api.bsky.app/xrpc/app.bsky.actor.getProfile?actor={}.bsky.social", - "urlMain": "https://bsky.app/", - "username_claimed": "mcuban" - }, "Platzi": { "errorType": "status_code", "errorCode": 404, From 7b3632bdadd4eba3473a1c0a728df522631d4654 Mon Sep 17 00:00:00 2001 From: JongMyeong HAN Date: Fri, 3 Oct 2025 04:00:41 +0900 Subject: [PATCH 09/33] Add comment to site 'namuwiki' Co-authored-by: Paul Pfeister --- sherlock_project/resources/data.json | 1 + 1 file changed, 1 insertion(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index c4efcbe3..f019000f 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -1466,6 +1466,7 @@ "username_claimed": "jambert" }, "namuwiki": { + "__comment__": "This is a Korean site and it's expected to return false negatives in certain other regions.", "errorType": "status_code", "url": "https://namu.wiki/w/%EC%82%AC%EC%9A%A9%EC%9E%90:{}", "urlMain": "https://namu.wiki/", From 355bfbd328c31144983904a65e6ad3aa8c003d9c Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Fri, 3 Oct 2025 00:42:07 +0530 Subject: [PATCH 10/33] fix(sites): Remediate false positive for DeviantArt --- sherlock_project/resources/data.json | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 4c84ac52..9738699b 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ 
-608,13 +608,15 @@ "urlMain": "https://www.dealabs.com/", "username_claimed": "blue" }, - "DeviantART": { - "errorType": "status_code", - "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", - "url": "https://{}.deviantart.com", - "urlMain": "https://deviantart.com", - "username_claimed": "blue" - }, + "DeviantArt": { + "errorType": "message", + "errorMsg": "Llama Not Found", + "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", + "url": "https://www.deviantart.com/{}", + "urlMain": "https://www.deviantart.com/", + "username_claimed": "blue", + "username_unclaimed": "noonewouldeverusethis" +}, "DigitalSpy": { "errorMsg": "The page you were looking for could not be found.", "errorType": "message", From b811b2bd47f0b45ac1cdffa9518470fff91a253e Mon Sep 17 00:00:00 2001 From: Paul Pfeister Date: Thu, 2 Oct 2025 18:21:20 -0400 Subject: [PATCH 11/33] chore: update code owners --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0f2eadf2..b9af7fda 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,5 @@ ### REPOSITORY -/.github/CODEOWNERS @sdushantha +/.github/CODEOWNERS @sdushantha @ppfeister /.github/FUNDING.yml @sdushantha /LICENSE @sdushantha From 779d4c33f4a88421a443695931d7041e55a51c7e Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Fri, 3 Oct 2025 03:55:03 +0530 Subject: [PATCH 12/33] fix: Remove username_unclaimed as requested --- sherlock_project/resources/data.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 9738699b..dc422754 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -608,14 +608,13 @@ "urlMain": "https://www.dealabs.com/", "username_claimed": "blue" }, - "DeviantArt": { + "DeviantArt": { "errorType": "message", "errorMsg": "Llama Not Found", "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", "url": 
"https://www.deviantart.com/{}", "urlMain": "https://www.deviantart.com/", - "username_claimed": "blue", - "username_unclaimed": "noonewouldeverusethis" + "username_claimed": "blue" }, "DigitalSpy": { "errorMsg": "The page you were looking for could not be found.", From c89a52caf7f55d36265866ffc2c9d390957a7734 Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Fri, 3 Oct 2025 04:25:46 +0530 Subject: [PATCH 13/33] fix(sites): Remediate false positive for AllMyLinks --- sherlock_project/resources/data.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 4c84ac52..091e2e9a 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -79,13 +79,13 @@ "username_claimed": "pink" }, "AllMyLinks": { - "errorMsg": "Not Found", - "errorType": "message", - "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$", - "url": "https://allmylinks.com/{}", - "urlMain": "https://allmylinks.com/", - "username_claimed": "blue" - }, + "errorMsg": "Page not found", + "errorType": "message", + "regexCheck": "^[a-z0-9][a-z0-9-]{2,32}$", + "url": "https://allmylinks.com/{}", + "urlMain": "https://allmylinks.com/", + "username_claimed": "blue" +}, "AniWorld": { "errorMsg": "Dieses Profil ist nicht verf\u00fcgbar", "errorType": "message", From d314d75db1636b14511997fe2d19a9b8bc6ef9b6 Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Fri, 3 Oct 2025 04:43:05 +0530 Subject: [PATCH 14/33] fix(sites): Remediate false positive for Mydramalist --- sherlock_project/resources/data.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 4c84ac52..dd1c2f39 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -1440,12 +1440,12 @@ "username_claimed": "blue" }, "Mydramalist": { - "errorMsg": "Sign in - MyDramaList", 
- "errorType": "message", - "url": "https://www.mydramalist.com/profile/{}", - "urlMain": "https://mydramalist.com", - "username_claimed": "elhadidy12398" - }, + "errorMsg": "The requested page was not found", + "errorType": "message", + "url": "https://www.mydramalist.com/profile/{}", + "urlMain": "https://mydramalist.com", + "username_claimed": "elhadidy12398" +}, "Myspace": { "errorType": "status_code", "url": "https://myspace.com/{}", From b245c462c92bf1655b3c871217f9683c1544554e Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Fri, 3 Oct 2025 05:56:52 +0530 Subject: [PATCH 15/33] fix(sites): Remediate false positive for Apple Discussions --- sherlock_project/resources/data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 4c84ac52..cd081b00 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -115,7 +115,7 @@ "username_claimed": "lio24d" }, "Apple Discussions": { - "errorMsg": "The page you tried was not found. You may have used an outdated link or may have typed the address (URL) incorrectly.", + "errorMsg": "Looking for something in Apple Support Communities?", "errorType": "message", "url": "https://discussions.apple.com/profile/{}", "urlMain": "https://discussions.apple.com", From 0e7219b191d36b1ba06c16066c450377863ea571 Mon Sep 17 00:00:00 2001 From: dollaransh17 Date: Fri, 3 Oct 2025 13:41:43 +0530 Subject: [PATCH 16/33] Security Fix: Add timeout parameters to HTTP requests This fix addresses a critical security vulnerability where HTTP requests could hang indefinitely, potentially causing denial of service. 
Changes: - Added 10-second timeout to version check API call - Added 10-second timeout to GitHub pull request API call - Added 30-second timeout to data file downloads (larger timeout for data) - Added 10-second timeout to exclusions list download Impact: - Prevents infinite hangs that could freeze the application - Improves user experience with predictable response times - Fixes security issue flagged by Bandit static analysis (B113) - Makes the application more robust in poor network conditions The timeouts are conservative enough to work with slow connections while preventing indefinite blocking that could be exploited. --- sherlock_project/sherlock.py | 4 ++-- sherlock_project/sites.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py index 250175a5..ba630c73 100644 --- a/sherlock_project/sherlock.py +++ b/sherlock_project/sherlock.py @@ -742,7 +742,7 @@ def main(): # Check for newer version of Sherlock. If it exists, let the user know about it try: - latest_release_raw = requests.get(forge_api_latest_release).text + latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text latest_release_json = json_loads(latest_release_raw) latest_remote_tag = latest_release_json["tag_name"] @@ -802,7 +802,7 @@ def main(): if args.json_file.isnumeric(): pull_number = args.json_file pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}" - pull_request_raw = requests.get(pull_url).text + pull_request_raw = requests.get(pull_url, timeout=10).text pull_request_json = json_loads(pull_request_raw) # Check if it's a valid pull request diff --git a/sherlock_project/sites.py b/sherlock_project/sites.py index 2ba811d7..b7aaf4c5 100644 --- a/sherlock_project/sites.py +++ b/sherlock_project/sites.py @@ -129,7 +129,7 @@ class SitesInformation: if data_file_path.lower().startswith("http"): # Reference is to a URL. 
try: - response = requests.get(url=data_file_path) + response = requests.get(url=data_file_path, timeout=30) except Exception as error: raise FileNotFoundError( f"Problem while attempting to access data file URL '{data_file_path}': {error}" @@ -166,7 +166,7 @@ class SitesInformation: if honor_exclusions: try: - response = requests.get(url=EXCLUSIONS_URL) + response = requests.get(url=EXCLUSIONS_URL, timeout=10) if response.status_code == 200: exclusions = response.text.splitlines() exclusions = [exclusion.strip() for exclusion in exclusions] From 91f3b16993f2f1dc70d3750d84249ebff8d24038 Mon Sep 17 00:00:00 2001 From: dollaransh17 Date: Sat, 4 Oct 2025 02:55:57 +0530 Subject: [PATCH 17/33] fix(sites): Update BoardGameGeek URL structure and detection method BoardGameGeek changed from /user/{} to /profile/{} URL structure. Also updated from message to status_code detection as the site no longer returns clear error messages for non-existent users. --- sherlock_project/resources/data.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index b30ec929..3f7f5ac3 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -279,10 +279,9 @@ "username_claimed": "mcuban" }, "BoardGameGeek": { - "errorType": "message", + "errorType": "status_code", "regexCheck": "^[a-zA-Z0-9_]*$", - "errorMsg": "User not found", - "url": "https://boardgamegeek.com/user/{}", + "url": "https://boardgamegeek.com/profile/{}", "urlMain": "https://boardgamegeek.com", "username_claimed": "blue" }, From 3e653c46b07c858811619517b28a17742cb4847a Mon Sep 17 00:00:00 2001 From: dollaransh17 Date: Sat, 4 Oct 2025 03:12:47 +0530 Subject: [PATCH 18/33] fix(sites): Remove BoardGameGeek - unreliable detection BoardGameGeek returns identical pages for both existing and non-existing users, making reliable username detection impossible with HTTP-based methods. 
The site likely uses JavaScript to load user-specific content dynamically. --- sherlock_project/resources/data.json | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 3f7f5ac3..891b6245 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -278,13 +278,6 @@ "urlMain": "https://bsky.app/", "username_claimed": "mcuban" }, - "BoardGameGeek": { - "errorType": "status_code", - "regexCheck": "^[a-zA-Z0-9_]*$", - "url": "https://boardgamegeek.com/profile/{}", - "urlMain": "https://boardgamegeek.com", - "username_claimed": "blue" - }, "BongaCams": { "errorType": "status_code", "isNSFW": true, From c5e209d78e203f931a9e3bc6e51d6b49fdd33d3c Mon Sep 17 00:00:00 2001 From: dollaransh17 Date: Sat, 4 Oct 2025 11:23:55 +0530 Subject: [PATCH 19/33] fix(sites): Implement BoardGameGeek API detection as suggested Using the API endpoint suggested by akh7177: https://api.geekdo.com/api/users?username={} However, there's an edge case where valid users contain empty arrays in their JSON response (adminBadges[], userMicrobadges[], supportYears[]) which causes Sherlock's substring matching to incorrectly flag them as 'not found' when looking for the '[]' error pattern. The API correctly returns: - Valid user: JSON object with user data (but contains [] substrings) - Invalid user: Exactly '[]' (2 characters total) This needs further refinement to distinguish between the exact '[]' response vs JSON containing '[]' substrings. 
--- sherlock_project/resources/data.json | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 891b6245..09168d17 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -278,6 +278,15 @@ "urlMain": "https://bsky.app/", "username_claimed": "mcuban" }, + "BoardGameGeek": { + "errorMsg": "[]", + "errorType": "message", + "regexCheck": "^[a-zA-Z0-9_]*$", + "url": "https://boardgamegeek.com/profile/{}", + "urlMain": "https://boardgamegeek.com", + "urlProbe": "https://api.geekdo.com/api/users?username={}", + "username_claimed": "blue" + }, "BongaCams": { "errorType": "status_code", "isNSFW": true, From 94c013886a677df9b7e1192267d548b4520f2958 Mon Sep 17 00:00:00 2001 From: dollaransh17 Date: Sat, 4 Oct 2025 11:33:27 +0530 Subject: [PATCH 20/33] fix(sites): Remove BoardGameGeek due to incompatible detection BoardGameGeek cannot be reliably detected with Sherlock's current capabilities: - Original HTML detection: Returns false positives - API endpoint approach: The API returns status 200 for both valid and invalid users - Invalid user: Returns exactly '[]' - Valid user: Returns JSON containing '[]' substrings (e.g., "adminBadges":[]) Since Sherlock's 'message' errorType uses substring matching, it incorrectly identifies valid users as "not found" when checking for '[]' in the response. The site's API response format is fundamentally incompatible with Sherlock's detection methods (message/status_code/response_url), so removal is the only viable solution to prevent false positives and false negatives. Addresses false positive issue originally reported in testing. 
--- sherlock_project/resources/data.json | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 09168d17..891b6245 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -278,15 +278,6 @@ "urlMain": "https://bsky.app/", "username_claimed": "mcuban" }, - "BoardGameGeek": { - "errorMsg": "[]", - "errorType": "message", - "regexCheck": "^[a-zA-Z0-9_]*$", - "url": "https://boardgamegeek.com/profile/{}", - "urlMain": "https://boardgamegeek.com", - "urlProbe": "https://api.geekdo.com/api/users?username={}", - "username_claimed": "blue" - }, "BongaCams": { "errorType": "status_code", "isNSFW": true, From 57a0ccef38066b769061736bc165fb0d94a4a516 Mon Sep 17 00:00:00 2001 From: Abhyuday K Hegde <66260177+akh7177@users.noreply.github.com> Date: Sat, 4 Oct 2025 14:30:40 +0530 Subject: [PATCH 21/33] Remediate False Positive for Roblox --- sherlock_project/resources/data.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index b30ec929..2d965176 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -1823,8 +1823,7 @@ "username_claimed": "blue" }, "Roblox": { - "errorMsg": "Page cannot be found or no longer exists", - "errorType": "message", + "errorType": "status_code", "url": "https://www.roblox.com/user.aspx?username={}", "urlMain": "https://www.roblox.com/", "username_claimed": "bluewolfekiller" From 977ad5c1a48e93cce720941d6777e150099ac183 Mon Sep 17 00:00:00 2001 From: Abhyuday K Hegde <66260177+akh7177@users.noreply.github.com> Date: Sat, 4 Oct 2025 14:48:37 +0530 Subject: [PATCH 22/33] Remediate False Positive for SlideShare --- sherlock_project/resources/data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 
b30ec929..7e984273 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -1932,7 +1932,7 @@ }, "SlideShare": { "errorType": "message", - "errorMsg": "Username available", + "errorMsg": "Page no longer exists", "url": "https://slideshare.net/{}", "urlMain": "https://slideshare.net/", "username_claimed": "blue" From 5cd769c2f46e9615fdc3d6e43341e3f868256597 Mon Sep 17 00:00:00 2001 From: Abhyuday K Hegde <66260177+akh7177@users.noreply.github.com> Date: Sat, 4 Oct 2025 15:12:20 +0530 Subject: [PATCH 23/33] Remediate False Positives for CyberDefenders --- sherlock_project/resources/data.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index b30ec929..4787ffeb 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -572,8 +572,7 @@ "username_claimed": "brown" }, "CyberDefenders": { - "errorMsg": "Blue Team Training for SOC analysts and DFIR - CyberDefenders", - "errorType": "message", + "errorType": "status_code", "regexCheck": "^[^\\/:*?\"<>|@]{3,50}$", "request_method": "GET", "url": "https://cyberdefenders.org/p/{}", From 3079e7a218dcb1e25373e0ca73b43d8782ee5906 Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Sat, 4 Oct 2025 15:25:30 +0530 Subject: [PATCH 24/33] fix(ci): Use merge-base for correct target validation --- .../workflows/validate_modified_targets.yml | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/.github/workflows/validate_modified_targets.yml b/.github/workflows/validate_modified_targets.yml index de024090..3e5fde31 100644 --- a/.github/workflows/validate_modified_targets.yml +++ b/.github/workflows/validate_modified_targets.yml @@ -14,33 +14,43 @@ jobs: contents: read pull-requests: write steps: - - name: Checkout repository + - name: Checkout PR branch uses: actions/checkout@v5 with: - ref: ${{ github.base_ref }} - fetch-depth: 1 
+ # Check out the actual PR code, not the base branch + ref: ${{ github.event.pull_request.head.sha }} + # Fetch all history so we can find the common ancestor (merge-base) + fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v6 with: - python-version: '3.13' + python-version: "3.13" - name: Install Poetry uses: abatilo/actions-poetry@v4 with: - poetry-version: 'latest' + poetry-version: "latest" - name: Install dependencies run: | poetry install --no-interaction --with dev - - name: Drop in place updated manifest from base + - name: Prepare JSON versions for comparison run: | - cp sherlock_project/resources/data.json data.json.base - git fetch origin pull/${{ github.event.pull_request.number }}/head:pr --depth=1 - git show pr:sherlock_project/resources/data.json > sherlock_project/resources/data.json + # Fetch the target branch to ensure we can compare against it + git fetch origin ${{ github.base_ref }} + + # Find the exact commit where this branch split from the target branch + MERGE_BASE=$(git merge-base origin/${{ github.base_ref }} HEAD) + echo "Comparing HEAD against merge-base commit: $MERGE_BASE" + + # Copy the version of the file from the current PR branch (HEAD) cp sherlock_project/resources/data.json data.json.head + # Extract the version of the file from the merge-base commit + git show $MERGE_BASE:sherlock_project/resources/data.json > data.json.base + - name: Discover modified targets id: discover-modified run: | From dc869852bc5674f158db79bb2b4a3ad42b879f0e Mon Sep 17 00:00:00 2001 From: dollaransh17 Date: Sat, 4 Oct 2025 17:22:50 +0530 Subject: [PATCH 25/33] fix(sites): Fix Threads false positive detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Threads was showing false positives for non-existent users because the error message detection was incorrect. 
Updated errorMsg: - Old: "Threads" (generic, matches valid pages too) - New: "Threads • Log in" (specific to non-existent users) When a user doesn't exist, Threads redirects to a login page with the title "Threads • Log in". Valid user profiles have titles like "Username (@username) • Threads, Say more". Tested with: - Invalid user (impossibleuser12345): Correctly not found - Valid user (zuck): Correctly found This fixes the false positive issue where non-existent Threads profiles were being reported as found. --- sherlock_project/resources/data.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index b30ec929..1f6b3d9e 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -2820,7 +2820,7 @@ "username_claimed": "green" }, "threads": { - "errorMsg": "Threads", + "errorMsg": "Threads • Log in", "errorType": "message", "headers": { "Sec-Fetch-Mode": "navigate" From b99719ce6014312445614d856df95dbae37b5991 Mon Sep 17 00:00:00 2001 From: obiwan04kanobi Date: Sun, 5 Oct 2025 00:22:12 +0530 Subject: [PATCH 26/33] Add Docker build test to CI workflow - Adds docker-build-test job to regression.yml - Runs on push/merge to master and release branches - Extracts VERSION_TAG from pyproject.toml for build - Tests that Docker image builds and runs successfully - Resolves dockerfile syntax warnings - Resolves #2196 --- .github/workflows/regression.yml | 27 +++++++++++++++++++++++++-- Dockerfile | 2 +- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index e366f29d..5029b870 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -11,6 +11,7 @@ on: - '**/*.py' - '**/*.ini' - '**/*.toml' + - 'Dockerfile' push: branches: - master @@ -21,11 +22,13 @@ on: - '**/*.py' - '**/*.ini' - '**/*.toml' + - 'Dockerfile' jobs: tox-lint: - # Linting
is ran through tox to ensure that the same linter is used by local runners runs-on: ubuntu-latest + # Linting is ran through tox to ensure that the same linter + # is used by local runners steps: - uses: actions/checkout@v4 - name: Set up linting environment @@ -41,7 +44,8 @@ jobs: tox-matrix: runs-on: ${{ matrix.os }} strategy: - fail-fast: false # We want to know what specicic versions it fails on + # We want to know what specicic versions it fails on + fail-fast: false matrix: os: [ ubuntu-latest, @@ -67,3 +71,22 @@ jobs: pip install tox-gh-actions - name: Run tox run: tox + docker-build-test: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Get version from pyproject.toml + id: get-version + run: | + VERSION=$(grep -m1 'version = ' pyproject.toml | cut -d'"' -f2) + echo "version=$VERSION" >> $GITHUB_OUTPUT + - name: Build Docker image + run: | + docker build \ + --build-arg VERSION_TAG=${{ steps.get-version.outputs.version }} \ + -t sherlock-test:latest . + - name: Test Docker image runs + run: docker run --rm sherlock-test:latest --version diff --git a/Dockerfile b/Dockerfile index 361530ab..ccdfbf23 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ # 3. Build image with BOTH latest and version tags # i.e. 
`docker build -t sherlock/sherlock:0.16.0 -t sherlock/sherlock:latest .` -FROM python:3.12-slim-bullseye as build +FROM python:3.12-slim-bullseye AS build WORKDIR /sherlock RUN pip3 install --no-cache-dir --upgrade pip From 0794e02b525a2bf5c9222c3da51a714f96b42d64 Mon Sep 17 00:00:00 2001 From: Paul Pfeister Date: Sat, 4 Oct 2025 16:53:30 -0400 Subject: [PATCH 27/33] feat: support multiple errorTypes --- sherlock_project/resources/data.schema.json | 217 +++++++++++++------- sherlock_project/sherlock.py | 109 +++++----- 2 files changed, 201 insertions(+), 125 deletions(-) diff --git a/sherlock_project/resources/data.schema.json b/sherlock_project/resources/data.schema.json index 216ffb62..c717cb25 100644 --- a/sherlock_project/resources/data.schema.json +++ b/sherlock_project/resources/data.schema.json @@ -1,80 +1,149 @@ { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "title": "Sherlock Target Manifest", - "description": "Social media targets to probe for the existence of known usernames", - "type": "object", - "properties": { - "$schema": { "type": "string" } - }, - "patternProperties": { - "^(?!\\$).*?$": { - "type": "object", - "description": "Target name and associated information (key should be human readable name)", - "required": [ "url", "urlMain", "errorType", "username_claimed" ], - "properties": { - "url": { "type": "string" }, - "urlMain": { "type": "string" }, - "urlProbe": { "type": "string" }, - "username_claimed": { "type": "string" }, - "regexCheck": { "type": "string" }, - "isNSFW": { "type": "boolean" }, - "headers": { "type": "object" }, - "request_payload": { "type": "object" }, - "__comment__": { - "type": "string", - "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock." 
- }, - "tags": { - "oneOf": [ - { "$ref": "#/$defs/tag" }, - { "type": "array", "items": { "$ref": "#/$defs/tag" } } - ] - }, - "request_method": { - "type": "string", - "enum": [ "GET", "POST", "HEAD", "PUT" ] - }, + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Sherlock Target Manifest", + "description": "Social media targets to probe for the existence of known usernames", + "type": "object", + "properties": { + "$schema": { "type": "string" } + }, + "patternProperties": { + "^(?!\\$).*?$": { + "type": "object", + "description": "Target name and associated information (key should be human readable name)", + "required": ["url", "urlMain", "errorType", "username_claimed"], + "properties": { + "url": { "type": "string" }, + "urlMain": { "type": "string" }, + "urlProbe": { "type": "string" }, + "username_claimed": { "type": "string" }, + "regexCheck": { "type": "string" }, + "isNSFW": { "type": "boolean" }, + "headers": { "type": "object" }, + "request_payload": { "type": "object" }, + "__comment__": { + "type": "string", + "description": "Used to clarify important target information if (and only if) a commit message would not suffice.\nThis key should not be parsed anywhere within Sherlock." 
+ }, + "tags": { + "oneOf": [ + { "$ref": "#/$defs/tag" }, + { "type": "array", "items": { "$ref": "#/$defs/tag" } } + ] + }, + "request_method": { + "type": "string", + "enum": ["GET", "POST", "HEAD", "PUT"] + }, + "errorType": { + "oneOf": [ + { + "type": "string", + "enum": ["message", "response_url", "status_code"] + }, + { + "type": "array", + "items": { + "type": "string", + "enum": ["message", "response_url", "status_code"] + } + } + ] + }, + "errorMsg": { + "oneOf": [ + { "type": "string" }, + { "type": "array", "items": { "type": "string" } } + ] + }, + "errorCode": { + "oneOf": [ + { "type": "integer" }, + { "type": "array", "items": { "type": "integer" } } + ] + }, + "errorUrl": { "type": "string" }, + "response_url": { "type": "string" } + }, + "dependencies": { + "errorMsg": { + "oneOf": [ + { "properties": { "errorType": { "const": "message" } } }, + { + "properties": { "errorType": { - "type": "string", - "enum": [ "message", "response_url", "status_code" ] - }, - "errorMsg": { - "oneOf": [ - { "type": "string" }, - { "type": "array", "items": { "type": "string" } } - ] - }, - "errorCode": { - "oneOf": [ - { "type": "integer" }, - { "type": "array", "items": { "type": "integer" } } - ] - }, - "errorUrl": { "type": "string" }, - "response_url": { "type": "string" } - }, - "dependencies": { - "errorMsg": { - "properties" : { "errorType": { "const": "message" } } - }, - "errorUrl": { - "properties": { "errorType": { "const": "response_url" } } - }, - "errorCode": { - "properties": { "errorType": { "const": "status_code" } } + "type": "array", + "contains": { "const": "message" } } - }, - "if": { "properties": { "errorType": { "const": "message" } } }, - "then": { "required": [ "errorMsg" ] }, - "else": { - "if": { "properties": { "errorType": { "const": "response_url" } } }, - "then": { "required": [ "errorUrl" ] } - }, - "additionalProperties": false + } + } + ] + }, + "errorUrl": { + "oneOf": [ + { "properties": { "errorType": { "const": 
"response_url" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "response_url" } + } + } + } + ] + }, + "errorCode": { + "oneOf": [ + { "properties": { "errorType": { "const": "status_code" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "status_code" } + } + } + } + ] } - }, - "additionalProperties": false, - "$defs": { - "tag": { "type": "string", "enum": [ "adult", "gaming" ] } + }, + "allOf": [ + { + "if": { + "anyOf": [ + { "properties": { "errorType": { "const": "message" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "message" } + } + } + } + ] + }, + "then": { "required": ["errorMsg"] } + }, + { + "if": { + "anyOf": [ + { "properties": { "errorType": { "const": "response_url" } } }, + { + "properties": { + "errorType": { + "type": "array", + "contains": { "const": "response_url" } + } + } + } + ] + }, + "then": { "required": ["errorUrl"] } + } + ], + "additionalProperties": false } + }, + "additionalProperties": false, + "$defs": { + "tag": { "type": "string", "enum": ["adult", "gaming"] } + } } diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py index 250175a5..a776d8c3 100644 --- a/sherlock_project/sherlock.py +++ b/sherlock_project/sherlock.py @@ -381,6 +381,8 @@ def sherlock( # Get the expected error type error_type = net_info["errorType"] + if isinstance(error_type, str): + error_type: list[str] = [error_type] # Retrieve future and ensure it has finished future = net_info["request_future"] @@ -425,58 +427,63 @@ def sherlock( elif any(hitMsg in r.text for hitMsg in WAFHitMsgs): query_status = QueryStatus.WAF - elif error_type == "message": - # error_flag True denotes no error found in the HTML - # error_flag False denotes error found in the HTML - error_flag = True - errors = net_info.get("errorMsg") - # errors will hold the error message - # it can be string or list - # by isinstance method we can detect 
that - # and handle the case for strings as normal procedure - # and if its list we can iterate the errors - if isinstance(errors, str): - # Checks if the error message is in the HTML - # if error is present we will set flag to False - if errors in r.text: - error_flag = False - else: - # If it's list, it will iterate all the error message - for error in errors: - if error in r.text: - error_flag = False - break - if error_flag: - query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE - elif error_type == "status_code": - error_codes = net_info.get("errorCode") - query_status = QueryStatus.CLAIMED - - # Type consistency, allowing for both singlets and lists in manifest - if isinstance(error_codes, int): - error_codes = [error_codes] - - if error_codes is not None and r.status_code in error_codes: - query_status = QueryStatus.AVAILABLE - elif r.status_code >= 300 or r.status_code < 200: - query_status = QueryStatus.AVAILABLE - elif error_type == "response_url": - # For this detection method, we have turned off the redirect. - # So, there is no need to check the response URL: it will always - # match the request. Instead, we will ensure that the response - # code indicates that the request was successful (i.e. no 404, or - # forward to some odd redirect). - if 200 <= r.status_code < 300: - query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE else: - # It should be impossible to ever get here... - raise ValueError( - f"Unknown Error Type '{error_type}' for " f"site '{social_network}'" - ) + if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type): + # It should be impossible to ever get here... 
+ raise ValueError( + f"Unknown Error Type '{error_type}' for " + f"site '{social_network}'" + ) + + if "message" in error_type: + # error_flag True denotes no error found in the HTML + # error_flag False denotes error found in the HTML + error_flag = True + errors = net_info.get("errorMsg") + # errors will hold the error message + # it can be string or list + # by isinstance method we can detect that + # and handle the case for strings as normal procedure + # and if its list we can iterate the errors + if isinstance(errors, str): + # Checks if the error message is in the HTML + # if error is present we will set flag to False + if errors in r.text: + error_flag = False + else: + # If it's list, it will iterate all the error message + for error in errors: + if error in r.text: + error_flag = False + break + if error_flag: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE + + if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE: + error_codes = net_info.get("errorCode") + query_status = QueryStatus.CLAIMED + + # Type consistency, allowing for both singlets and lists in manifest + if isinstance(error_codes, int): + error_codes = [error_codes] + + if error_codes is not None and r.status_code in error_codes: + query_status = QueryStatus.AVAILABLE + elif r.status_code >= 300 or r.status_code < 200: + query_status = QueryStatus.AVAILABLE + + if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE: + # For this detection method, we have turned off the redirect. + # So, there is no need to check the response URL: it will always + # match the request. Instead, we will ensure that the response + # code indicates that the request was successful (i.e. no 404, or + # forward to some odd redirect). 
+ if 200 <= r.status_code < 300: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE if dump_response: print("+++++++++++++++++++++") From 4d00884d8c9689bce722ef64fb5e0a5bb4238f8c Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Sun, 5 Oct 2025 03:00:21 +0530 Subject: [PATCH 28/33] fix(ci): Implement secure diff logic per feedback --- .../workflows/validate_modified_targets.yml | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/.github/workflows/validate_modified_targets.yml b/.github/workflows/validate_modified_targets.yml index 3e5fde31..4738ae2b 100644 --- a/.github/workflows/validate_modified_targets.yml +++ b/.github/workflows/validate_modified_targets.yml @@ -14,13 +14,11 @@ jobs: contents: read pull-requests: write steps: - - name: Checkout PR branch + - name: Checkout repository uses: actions/checkout@v5 with: - # Check out the actual PR code, not the base branch - ref: ${{ github.event.pull_request.head.sha }} - # Fetch all history so we can find the common ancestor (merge-base) - fetch-depth: 0 + # This is the original, secure checkout of the base branch. + ref: ${{ github.base_ref }} - name: Set up Python uses: actions/setup-python@v6 @@ -38,17 +36,21 @@ jobs: - name: Prepare JSON versions for comparison run: | - # Fetch the target branch to ensure we can compare against it - git fetch origin ${{ github.base_ref }} + # Fetch the PR's branch head and give it a local name 'pr' + git fetch origin pull/${{ github.event.pull_request.number }}/head:pr - # Find the exact commit where this branch split from the target branch - MERGE_BASE=$(git merge-base origin/${{ github.base_ref }} HEAD) - echo "Comparing HEAD against merge-base commit: $MERGE_BASE" + # The initial checkout may be shallow. To find a merge-base, + # we need more history. We can 'unshallow' the repository if needed. 
+ git fetch --unshallow || true - # Copy the version of the file from the current PR branch (HEAD) - cp sherlock_project/resources/data.json data.json.head + # Find the merge-base commit between the target branch (master) and the PR branch (pr) + MERGE_BASE=$(git merge-base origin/${{ github.base_ref }} pr) + echo "Comparing PR head against merge-base commit: $MERGE_BASE" - # Extract the version of the file from the merge-base commit + # Safely extract the version of the file from the PR's head without checking it out + git show pr:sherlock_project/resources/data.json > data.json.head + + # Safely extract the version of the file from the merge-base commit git show $MERGE_BASE:sherlock_project/resources/data.json > data.json.base - name: Discover modified targets @@ -57,8 +59,16 @@ jobs: CHANGED=$( python - <<'EOF' import json - with open("data.json.base") as f: base = json.load(f) - with open("data.json.head") as f: head = json.load(f) + import sys + try: + with open("data.json.base") as f: base = json.load(f) + with open("data.json.head") as f: head = json.load(f) + except FileNotFoundError as e: + print(f"Error: Could not find {e.filename}", file=sys.stderr) + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Could not decode JSON from a file - {e}", file=sys.stderr) + sys.exit(1) changed = [] for k, v in head.items(): From 52cd5fdfc136340b2c88ffe8c1dc953ff8b51cc5 Mon Sep 17 00:00:00 2001 From: Paul Pfeister Date: Sat, 4 Oct 2025 20:22:34 -0400 Subject: [PATCH 29/33] feat: gracefully skip sites with invalid errorType --- sherlock_project/sherlock.py | 97 +++++++++++++++++------------------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py index dcfbda04..d349c12b 100644 --- a/sherlock_project/sherlock.py +++ b/sherlock_project/sherlock.py @@ -429,61 +429,58 @@ def sherlock( else: if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type): - # 
It should be impossible to ever get here... - raise ValueError( - f"Unknown Error Type '{error_type}' for " - f"site '{social_network}'" - ) - - if "message" in error_type: - # error_flag True denotes no error found in the HTML - # error_flag False denotes error found in the HTML - error_flag = True - errors = net_info.get("errorMsg") - # errors will hold the error message - # it can be string or list - # by isinstance method we can detect that - # and handle the case for strings as normal procedure - # and if its list we can iterate the errors - if isinstance(errors, str): - # Checks if the error message is in the HTML - # if error is present we will set flag to False - if errors in r.text: - error_flag = False - else: - # If it's list, it will iterate all the error message - for error in errors: - if error in r.text: + error_context = f"Unknown error type '{error_type}' for {social_network}" + query_status = QueryStatus.UNKNOWN + else: + if "message" in error_type: + # error_flag True denotes no error found in the HTML + # error_flag False denotes error found in the HTML + error_flag = True + errors = net_info.get("errorMsg") + # errors will hold the error message + # it can be string or list + # by isinstance method we can detect that + # and handle the case for strings as normal procedure + # and if its list we can iterate the errors + if isinstance(errors, str): + # Checks if the error message is in the HTML + # if error is present we will set flag to False + if errors in r.text: error_flag = False - break - if error_flag: + else: + # If it's list, it will iterate all the error message + for error in errors: + if error in r.text: + error_flag = False + break + if error_flag: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE + + if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE: + error_codes = net_info.get("errorCode") query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE 
- if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE: - error_codes = net_info.get("errorCode") - query_status = QueryStatus.CLAIMED + # Type consistency, allowing for both singlets and lists in manifest + if isinstance(error_codes, int): + error_codes = [error_codes] - # Type consistency, allowing for both singlets and lists in manifest - if isinstance(error_codes, int): - error_codes = [error_codes] + if error_codes is not None and r.status_code in error_codes: + query_status = QueryStatus.AVAILABLE + elif r.status_code >= 300 or r.status_code < 200: + query_status = QueryStatus.AVAILABLE - if error_codes is not None and r.status_code in error_codes: - query_status = QueryStatus.AVAILABLE - elif r.status_code >= 300 or r.status_code < 200: - query_status = QueryStatus.AVAILABLE - - if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE: - # For this detection method, we have turned off the redirect. - # So, there is no need to check the response URL: it will always - # match the request. Instead, we will ensure that the response - # code indicates that the request was successful (i.e. no 404, or - # forward to some odd redirect). - if 200 <= r.status_code < 300: - query_status = QueryStatus.CLAIMED - else: - query_status = QueryStatus.AVAILABLE + if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE: + # For this detection method, we have turned off the redirect. + # So, there is no need to check the response URL: it will always + # match the request. Instead, we will ensure that the response + # code indicates that the request was successful (i.e. no 404, or + # forward to some odd redirect). 
+ if 200 <= r.status_code < 300: + query_status = QueryStatus.CLAIMED + else: + query_status = QueryStatus.AVAILABLE if dump_response: print("+++++++++++++++++++++") From 4246a7b16fb399967d766aac9d677c7d48b60aa5 Mon Sep 17 00:00:00 2001 From: Paul Pfeister Date: Sat, 4 Oct 2025 20:32:16 -0400 Subject: [PATCH 30/33] chore: make default --no-txt Workflows where a txt file is still required should use --txt --- sherlock_project/sherlock.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/sherlock_project/sherlock.py b/sherlock_project/sherlock.py index d349c12b..07b19af7 100644 --- a/sherlock_project/sherlock.py +++ b/sherlock_project/sherlock.py @@ -723,12 +723,22 @@ def main(): help="Include checking of NSFW sites from default list.", ) + # TODO deprecated in favor of --txt, retained for workflow compatibility, to be removed + # in future release parser.add_argument( "--no-txt", action="store_true", dest="no_txt", default=False, - help="Disable creation of a txt file", + help="Disable creation of a txt file - WILL BE DEPRECATED", + ) + + parser.add_argument( + "--txt", + action="store_true", + dest="output_txt", + default=False, + help="Enable creation of a txt file", ) parser.add_argument( @@ -892,7 +902,7 @@ def main(): else: result_file = f"{username}.txt" - if not args.no_txt: + if args.output_txt: with open(result_file, "w", encoding="utf-8") as file: exists_counter = 0 for website_name in results: From 70e3c0ddd8fd162d162bdace19c296da96be861b Mon Sep 17 00:00:00 2001 From: shreyasNaik0101 Date: Sun, 5 Oct 2025 11:00:14 +0530 Subject: [PATCH 31/33] fix(ci): Address review feedback for correctness and efficiency --- .../workflows/validate_modified_targets.yml | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/validate_modified_targets.yml b/.github/workflows/validate_modified_targets.yml index 4738ae2b..bb244511 100644 --- a/.github/workflows/validate_modified_targets.yml +++ 
b/.github/workflows/validate_modified_targets.yml @@ -17,8 +17,9 @@ jobs: - name: Checkout repository uses: actions/checkout@v5 with: - # This is the original, secure checkout of the base branch. + # Checkout the base branch but fetch all history to avoid a second fetch call ref: ${{ github.base_ref }} + fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v6 @@ -36,23 +37,21 @@ jobs: - name: Prepare JSON versions for comparison run: | - # Fetch the PR's branch head and give it a local name 'pr' + # Fetch only the PR's branch head (single network call in this step) git fetch origin pull/${{ github.event.pull_request.number }}/head:pr - # The initial checkout may be shallow. To find a merge-base, - # we need more history. We can 'unshallow' the repository if needed. - git fetch --unshallow || true - - # Find the merge-base commit between the target branch (master) and the PR branch (pr) + # Find the merge-base commit between the target branch and the PR branch MERGE_BASE=$(git merge-base origin/${{ github.base_ref }} pr) echo "Comparing PR head against merge-base commit: $MERGE_BASE" - # Safely extract the version of the file from the PR's head without checking it out + # Safely extract the file from the PR's head and the merge-base commit git show pr:sherlock_project/resources/data.json > data.json.head - - # Safely extract the version of the file from the merge-base commit git show $MERGE_BASE:sherlock_project/resources/data.json > data.json.base + # CRITICAL FIX: Overwrite the checked-out data.json with the one from the PR + # This ensures that pytest runs against the new, updated file. 
+ cp data.json.head sherlock_project/resources/data.json + - name: Discover modified targets id: discover-modified run: | @@ -83,6 +82,8 @@ jobs: echo -e ">>> Changed targets: \n$(echo $CHANGED | tr ',' '\n')" echo "changed_targets=$CHANGED" >> "$GITHUB_OUTPUT" + # --- The rest of the steps below are unchanged --- + - name: Validate modified targets if: steps.discover-modified.outputs.changed_targets != '' continue-on-error: true From 9e3448d9923fecec7504ef67cc5d0f0892494dcb Mon Sep 17 00:00:00 2001 From: dollaransh17 Date: Sun, 5 Oct 2025 11:59:41 +0530 Subject: [PATCH 32/33] fix(sites): Implement BoardGameGeek using username validation API - Added BoardGameGeek back using the new API endpoint suggested by @ppfeister - Uses https://api.geekdo.com/api/accounts/validate/username?username={} for detection - errorMsg matches '"isValid":true'; with errorType "message", a match marks the username as available (unclaimed) - This approach avoids the previous issues with: * HTML parsing returning false positives * User API returning JSON with '[]' substrings that caused detection problems - Successfully tested with both valid (blue) and invalid usernames Thanks @ppfeister for the API suggestion and @akh7177 for the initial guidance --- sherlock_project/resources/data.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 891b6245..6c09c39c 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -291,6 +291,14 @@ "urlMain": "https://www.bookcrossing.com/", "username_claimed": "blue" }, + "BoardGameGeek": { + "errorMsg": "\"isValid\":true", + "errorType": "message", + "url": "https://boardgamegeek.com/user/{}", + "urlMain": "https://boardgamegeek.com/", + "urlProbe": "https://api.geekdo.com/api/accounts/validate/username?username={}", + "username_claimed": "blue" + }, "BraveCommunity": { "errorType": "status_code", "url": "https://community.brave.com/u/{}/", From 
f0510a169ac3960171841d240dc52de7fe406b02 Mon Sep 17 00:00:00 2001 From: Abhyuday K Hegde <66260177+akh7177@users.noreply.github.com> Date: Sun, 5 Oct 2025 15:52:56 +0530 Subject: [PATCH 33/33] Add support for WakaTime --- sherlock_project/resources/data.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sherlock_project/resources/data.json b/sherlock_project/resources/data.json index 7837e5f6..9c160223 100644 --- a/sherlock_project/resources/data.json +++ b/sherlock_project/resources/data.json @@ -2320,6 +2320,12 @@ "urlMain": "https://discourse.wicg.io/", "username_claimed": "stefano" }, + "Wakatime": { + "errorType": "status_code", + "url": "https://wakatime.com/@{}", + "urlMain": "https://wakatime.com/", + "username_claimed": "blue" + }, "Warrior Forum": { "errorType": "status_code", "url": "https://www.warriorforum.com/members/{}.html",