Files
hcapEnv/analyze_priority.py
2026-02-21 18:27:49 +08:00

176 lines
6.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
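"""
Score and rank every fingerprint field found in chatgpt.com-*_analysis.json
by priority.

Scoring rules:
  +10  field used only for bot/automation detection
       (webdriver, $cdc_*, callPhantom, etc.)
  + 5  appears in the core detection loop, tH=154 or tH=155
  + 2  for each additional distinct tH the field appears in
       (cross-tH frequency)
  + 3  known high-risk API
       (Crypto, RTCPeerConnection, OfflineAudioContext, etc.)
  + 1  navigator / screen / canvas family
"""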
import json
import sys
import glob
from collections import defaultdict
# ── 配置 ─────────────────────────────────────────────────────
# Default input file: the lexicographically greatest path matching the glob
# (identical to the last element of the sorted match list).
# When nothing matches this is None instead of an IndexError raised while the
# module is still being imported; code that consumes ANALYSIS_JSON must be
# prepared for None in that case.
ANALYSIS_JSON = max(
    glob.glob("/home/carry/myprj/hcaptcha/asset/chatgpt.com-*_analysis.json"),
    default=None,
)
# Fields that exist only for bot/automation detection: their mere presence
# gives the client away.
BOT_SIGNALS = {
    "webdriver",
    "callPhantom",
    "callSelenium",
    "_selenium",
    "__phantomas",
    "domAutomationController",
    "awesomium",
    "$wdc_",
    "domAutomation",
    "_WEBDRIVER_ELEM_CACHE",
    "spawn",
    "__nightmare",
    "__webdriver_script_fn",
    "__webdriver_script_func",
    "__driver_evaluate",
    "__webdriver_evaluate",
    "__selenium_evaluate",
    "__fxdriver_evaluate",
    "__driver_unwrapped",
    "__webdriver_unwrapped",
    "__selenium_unwrapped",
    "__fxdriver_unwrapped",
    "hcaptchaCallbackZenno",
    "_Selenium_IDE_Recorder",
    "cdc_adoQpoasnfa76pfcZLmcfl_Array",
    "cdc_adoQpoasnfa76pfcZLmcfl_Promise",
    "cdc_adoQpoasnfa76pfcZLmcfl_Symbol",
    "CDCJStestRunStatus",
    "$cdc_asdjflasutopfhvcZLmcfl_",
    "$chrome_asyncScriptInfo",
}

# High-risk APIs (strong fingerprinting signal).
HIGH_RISK_APIS = {
    "Crypto",
    "RTCPeerConnection",
    "OfflineAudioContext",
    "CanvasRenderingContext2D",
    "HTMLCanvasElement",
    "WebGL2RenderingContext",
    "WebGLRenderingContext",
    "IDBFactory",
    "PluginArray",
    "NavigatorUAData",
    "PerformanceNavigationTiming",
    "PerformanceResourceTiming",
}

# navigator / screen / canvas family.
MEDIUM_APIS = {
    "Navigator",
    "Screen",
    "Storage",
    "Performance",
    "HTMLDocument",
    "ScreenOrientation",
    "NetworkInformation",
    "languages",
    "maxTouchPoints",
    "webdriver",
    "platform",
    "userAgent",
}

# tH values that make up the core detection loop.
CORE_TH = {154, 155}
# ── 加载 ────────────────────────────────────────────────────
def load(path):
    """Read the UTF-8 encoded JSON file at *path* and return the decoded object."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
# ── 评分 ────────────────────────────────────────────────────
def score(data):
    """Collect every candidate field name in *data* and assign it a score.

    *data* maps a tH key (a string convertible with ``int``) to an entry whose
    optional ``"ig_values"`` list holds the observed values.  Only strings that
    look like names are kept: non-strings, strings longer than 80 characters
    and strings starting with one of the known "value" prefixes are skipped.

    Returns a ``defaultdict`` mapping each kept name to a dict with
    ``"tH_set"`` (the tH numbers it was seen in), ``"score"`` (sum of all rule
    points) and ``"reasons"`` (one text per rule that fired, in rule order).
    """
    # Strings starting with these are values, not API names.
    value_prefixes = ("0,1,2", "1:", "#", "return ", "https://")
    api_info = defaultdict(lambda: {"tH_set": set(), "score": 0, "reasons": []})

    # Pass 1: remember in which tH each name shows up.
    for raw_th, entry in data.items():
        th_number = int(raw_th)
        for candidate in entry.get("ig_values", []):
            looks_like_name = (
                isinstance(candidate, str)
                and not candidate.startswith(value_prefixes)
                and len(candidate) <= 80
            )
            if looks_like_name:
                api_info[candidate]["tH_set"].add(th_number)

    # Pass 2: evaluate the scoring rules; each hit is a (points, reason) pair.
    for name, record in api_info.items():
        seen_in = record["tH_set"]
        hits = []

        # Bot-detection-only field.
        if name in BOT_SIGNALS:
            hits.append((10, "🚨 bot检测字段 +10"))

        # Seen inside the core detection loop.
        core_hit = seen_in & CORE_TH
        if core_hit:
            hits.append((5, f"🎯 核心循环 tH={sorted(core_hit)} +5"))

        # High-risk API.
        if name in HIGH_RISK_APIS:
            hits.append((3, "⚡ 高风险API +3"))

        # navigator / screen family.
        if name in MEDIUM_APIS:
            hits.append((1, "📡 navigator/screen类 +1"))

        # Cross-tH frequency: +2 for every tH beyond the first.
        freq = len(seen_in)
        if freq > 1:
            bonus = 2 * (freq - 1)
            hits.append((bonus, f"🔁 跨{freq}个tH +{bonus}"))

        record["score"] = sum(points for points, _ in hits)
        record["reasons"] = [text for _, text in hits]

    return api_info
# ── 输出 ─────────────────────────────────────────────────────
def _priority_path(source_path):
    """Derive the ``*_priority.json`` output path from the analysed file's path."""
    analysis_suffix = "_analysis.json"
    if source_path.endswith(analysis_suffix):
        return source_path[:-len(analysis_suffix)] + "_priority.json"
    # Any other file name: never hand back the input path itself, otherwise
    # the export would overwrite the file that was just analysed.
    if source_path.endswith(".json"):
        source_path = source_path[:-len(".json")]
    return source_path + "_priority.json"


def report(api_info, source_path=None):
    """Print the ranking grouped by priority tier and export it as JSON.

    Args:
        api_info: mapping of field name to a dict holding ``"score"``,
            ``"tH_set"`` and ``"reasons"`` (the structure ``score`` returns).
        source_path: path of the analysis file the scores came from; the
            export is written next to it as ``*_priority.json``.  Defaults to
            the module-level ``ANALYSIS_JSON``.

    Raises:
        ValueError: if neither ``source_path`` nor ``ANALYSIS_JSON`` provides
            a path to derive the output location from.
    """
    if source_path is None:
        source_path = ANALYSIS_JSON
    if source_path is None:
        raise ValueError(
            "no analysis file path available to derive the output path from"
        )
    out_path = _priority_path(str(source_path))

    # Highest score first; the sort is stable, so ties keep insertion order.
    ranked = sorted(api_info.items(), key=lambda item: -item[1]["score"])

    print("=" * 70)
    print(" HSW 指纹字段 优先级排名")
    print("=" * 70)

    # Priority tiers: (label, predicate on the score).
    tiers = [
        ("🔴 P0 必须正确≥10分", lambda s: s >= 10),
        ("🟠 P1 高优先级5~9分", lambda s: 5 <= s < 10),
        ("🟡 P2 中优先级3~4分", lambda s: 3 <= s < 5),
        ("🟢 P3 低优先级1~2分", lambda s: 1 <= s < 3),
        ("⚪ P4 可忽略0分", lambda s: s == 0),
    ]
    # Rule drawn under the column headers, one segment per column width.
    rule = " ".join("─" * width for width in (5, 45, 20))

    for tier_label, condition in tiers:
        tier_items = [(api, info) for api, info in ranked if condition(info["score"])]
        if not tier_items:
            continue
        print(f"\n{tier_label} [{len(tier_items)} 个]")
        print(f" {'分数':<5} {'字段名':<45} 出现tH")
        print(f" {rule}")
        for api, info in tier_items:
            tH_list = ",".join(str(t) for t in sorted(info["tH_set"]))
            print(f" {info['score']:<5} {api:<45} tH={tH_list}")
            for r in info["reasons"]:
                print(f" {r}")

    # JSON export, in ranking order (sets are not JSON-serialisable, so the
    # tH set is written as a sorted list).
    out = {
        api: {
            "score": info["score"],
            "tH_list": sorted(info["tH_set"]),
            "reasons": info["reasons"],
        }
        for api, info in ranked
    }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f"\n📄 优先级结果已写入: {out_path}")


# ── Entry point ──────────────────────────────────────────────
if __name__ == "__main__":
    # An explicit path on the command line wins over the default file.
    path = sys.argv[1] if len(sys.argv) > 1 else ANALYSIS_JSON
    if path is None:
        sys.exit("❌ 未找到 *_analysis.json, 请通过命令行参数指定路径")
    print(f"📂 读取: {path}\n")
    data = load(path)
    api_info = score(data)
    # Write the export next to the file that was actually analysed.
    report(api_info, source_path=path)