#!/usr/bin/env python3
"""
strip_page.py - HTML slimming ("de-fatting") script, v2

Fix: handles base64 images in every form (unquoted / double-quoted /
single-quoted / url() / style attribute).
"""
|
|||
|
|
|
|||
|
|
import re
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
# A <style> block is left inline while its content is shorter than this many
# characters; blocks at or above the limit are moved to the external CSS file.
STRIP_THRESHOLD_STYLE_CHARS = 50000
# Short data URI written in place of every embedded base64 image. The payload
# decodes to the 8-byte PNG signature only, so it is a marker rather than a
# displayable image.
BASE64_PLACEHOLDER = 'data:image/png;base64,iVBORw0KGgo='
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_css_variables(style_text):
    """Return the CSS custom-property declarations found in ``style_text``.

    Args:
        style_text: Raw CSS, either the body of a ``<style>`` block or the
            value of a ``style`` attribute.

    Returns:
        A list of strings of the form ``' --name: value;'`` in source order.
        A trailing ``!important`` is dropped from the value. Duplicates are
        not removed here.
    """
    # CSS lets the last declaration of a block omit its semicolon, so a
    # declaration may end at ';', right before a closing '}', or at the end
    # of the text (the usual case for style="" attribute values).
    declaration_re = (
        r'(--[\w-]+)\s*:\s*([^;{}]+?)\s*(?:!important\s*)?(?:;|(?=\})|\Z)'
    )
    return [
        f' {m.group(1)}: {m.group(2).strip()};'
        for m in re.finditer(declaration_re, style_text)
    ]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _scan_css_variables(html):
    """Collect custom-property declarations from <style> blocks and style="" attributes."""
    found = []
    style_block_re = r'<style\b[^>]*>(.*?)</style>'
    for m in re.finditer(style_block_re, html, flags=re.DOTALL | re.IGNORECASE):
        found.extend(extract_css_variables(m.group(1)))
    for m in re.finditer(r'style="([^"]+)"', html):
        found.extend(extract_css_variables(m.group(1)))
    return found


def _swap_base64_images(html, stats):
    """Replace embedded base64 images with BASE64_PLACEHOLDER and update ``stats``."""
    css_url_re = re.compile(
        r'''url\(\s*(?:"(data:image[^"]+)"|'(data:image[^']+)'|(data:image[^\s)]+))\s*\)''',
        re.IGNORECASE,
    )
    img_src_re = re.compile(
        r'''<img([^>]*?)src=(?:"(data:image/[^"]+)"|'(data:image/[^']+)')''',
        re.IGNORECASE,
    )

    def _account(data):
        stats['base64_replaced'] += 1
        # Net saving: the placeholder itself still occupies space.
        stats['base64_saved_chars'] += len(data) - len(BASE64_PLACEHOLDER)

    def _swap_css_url(m):
        double_q, single_q, bare = m.groups()
        data = double_q or single_q or bare
        if data == BASE64_PLACEHOLDER:
            # Already a placeholder (e.g. re-running on stripped output).
            return m.group(0)
        _account(data)
        # Keep the author's quoting: injecting double quotes into an unquoted
        # url() would terminate an enclosing style="..." attribute.
        quote = '"' if double_q else "'" if single_q else ''
        return f'url({quote}{BASE64_PLACEHOLDER}{quote})'

    def _swap_img_src(m):
        attrs, double_q, single_q = m.groups()
        data = double_q or single_q
        if data == BASE64_PLACEHOLDER:
            return m.group(0)
        _account(data)
        quote = '"' if double_q else "'"
        return f'<img{attrs}src={quote}{BASE64_PLACEHOLDER}{quote}'

    html = css_url_re.sub(_swap_css_url, html)
    return img_src_re.sub(_swap_img_src, html)


def _externalize_large_styles(html, stats):
    """Swap oversized <style> blocks for marker comments; return (html, [(id, css), ...])."""
    extracted = []

    def _move(m):
        content = m.group(1)
        if len(content) < STRIP_THRESHOLD_STYLE_CHARS:
            return m.group(0)
        stats['styles_extracted'] += 1
        stats['styles_saved_chars'] += len(content)
        style_id = f'extracted-style-{stats["styles_extracted"]}'
        extracted.append((style_id, content))
        return f'<!-- {style_id} moved to external CSS -->'

    html = re.sub(
        r'<style\b[^>]*>(.*?)</style>', _move, html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    return html, extracted


def _dedupe_css_variables(declarations):
    """Keep the first declaration seen for each custom-property name."""
    seen = set()
    unique = []
    for decl in declarations:
        key = decl.strip().split(':')[0]
        if key.startswith('--') and key not in seen:
            seen.add(key)
            unique.append(decl)
    return unique


def _attach_stylesheet_link(html, stem):
    """Make sure the document links to the extracted ``{stem}.styles.css`` file."""
    link_tag = f'<link rel="stylesheet" href="{stem}.styles.css">'
    if link_tag in html:
        return html
    # Preferred spot: right after the opening <head ...> tag (any case, any
    # attributes). A callable replacement keeps backslashes in the file name
    # from being read as regex escapes.
    patched, hits = re.subn(
        r'(<head\b[^>]*>)', lambda m: f'{m.group(1)}\n {link_tag}',
        html, count=1, flags=re.IGNORECASE,
    )
    if hits:
        return patched
    # No <head>: place the link where the first extracted block used to be.
    first_marker = '<!-- extracted-style-1 moved to external CSS -->'
    if first_marker in html:
        return html.replace(first_marker, link_tag + first_marker, 1)
    return f'{link_tag}\n{html}'


def strip_html(input_path, output_dir=None):
    """Write a slimmed copy of an HTML file and print a summary report.

    Steps, in order: collect CSS custom properties, replace embedded base64
    images with ``BASE64_PLACEHOLDER``, move ``<style>`` blocks of at least
    ``STRIP_THRESHOLD_STYLE_CHARS`` characters to an external stylesheet,
    remove ``<script>`` elements, and neutralise ``href="data:..."`` links.

    Args:
        input_path: Path of the HTML file to process (read as UTF-8, with
            undecodable bytes replaced).
        output_dir: Directory for the output files; created if missing.
            Defaults to the directory containing ``input_path``.

    Output files (named after the input file's stem):
        ``<stem>.stripped.html`` - always written.
        ``<stem>.styles.css``    - only when a style block was extracted.
        ``<stem>.variables.css`` - only when custom properties were found.
    """
    input_path = Path(input_path)
    output_dir = input_path.parent if output_dir is None else Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = input_path.stem
    html_out = output_dir / f'{stem}.stripped.html'
    css_out = output_dir / f'{stem}.styles.css'
    vars_out = output_dir / f'{stem}.variables.css'

    html = input_path.read_text(encoding='utf-8', errors='replace')
    original_size = len(html)
    stats = {
        'base64_replaced': 0,
        'base64_saved_chars': 0,
        'styles_extracted': 0,
        'styles_saved_chars': 0,
        'css_vars_extracted': 0,
        'scripts_removed': 0,
        'data_hrefs_fixed': 0,
    }

    # 0. Scan CSS variables before any rewriting. This pass sees every
    #    <style> block, so extracted blocks need no second scan later.
    all_vars = _scan_css_variables(html)

    # 1. Replace every embedded base64 image.
    html = _swap_base64_images(html, stats)

    # 2. Move very large inline <style> blocks out of the document.
    html, extracted_styles = _externalize_large_styles(html, stats)

    # 3. Write the external stylesheet.
    if extracted_styles:
        css_parts = ['/* === Extracted inline styles === */', '']
        for style_id, content in extracted_styles:
            css_parts += [f'/* --- {style_id} --- */', content, '']
        css_out.write_text('\n'.join(css_parts), encoding='utf-8')

    # 4. Write the variables file.
    unique_vars = _dedupe_css_variables(all_vars)
    if unique_vars:
        vars_out.write_text(
            '/* === Extracted CSS Variables === */\n:root {\n'
            + '\n'.join(unique_vars)
            + '\n}\n',
            encoding='utf-8',
        )
        stats['css_vars_extracted'] = len(unique_vars)

    # 5. Remove <script> elements (paired first, then self-closing leftovers).
    html, paired = re.subn(
        r'<script\b[^>]*>.*?</script>', '', html,
        flags=re.DOTALL | re.IGNORECASE,
    )
    html, self_closed = re.subn(
        r'<script\b[^>]*/>\s*', '', html, flags=re.IGNORECASE,
    )
    stats['scripts_removed'] = paired + self_closed

    # 6. Neutralise data: hrefs.
    html, stats['data_hrefs_fixed'] = re.subn(
        r'href="data:[^"]+"', 'href="#data-link-removed"', html,
    )

    # 7. Final clean-up and write the stripped HTML.
    html = re.sub(r'<!--\s*-->', '', html)
    if extracted_styles:
        html = _attach_stylesheet_link(html, stem)
    html_out.write_text(html, encoding='utf-8')

    new_html_size = len(html)
    # Only count files written by this run; a leftover file from an earlier
    # run must not distort the report.
    css_size = css_out.stat().st_size if extracted_styles else 0
    vars_size = vars_out.stat().st_size if unique_vars else 0
    saved = original_size - new_html_size - css_size
    # An empty input file would otherwise divide by zero.
    saved_pct = saved / original_size * 100 if original_size else 0.0

    rule = '=' * 54
    report_lines = [
        '',
        '✅ 脱脂完成!',
        rule,
        f'📄 输入: {input_path.name}',
        f'原始大小: {original_size/1024:.1f} KB ({original_size:,} 字符)',
        rule,
        '📊 处理结果:',
        f'base64 图片替换: {stats["base64_replaced"]} 个',
        f'base64 节省: {stats["base64_saved_chars"]/1024:.1f} KB',
        f'style 块提取: {stats["styles_extracted"]} 个',
        f'style 节省: {stats["styles_saved_chars"]/1024:.1f} KB',
        f'CSS 变量提取: {stats["css_vars_extracted"]} 个',
        f'script 移除: {stats["scripts_removed"]} 个',
        f'data href 修复: {stats["data_hrefs_fixed"]} 个',
        rule,
        '📦 输出文件:',
        f'HTML (脱脂): {html_out.name} ({new_html_size/1024:.1f} KB)',
        f'CSS (提取样式): {css_out.name} ({css_size/1024:.1f} KB)',
        f'CSS (变量): {vars_out.name} ({vars_size/1024:.1f} KB)',
        rule,
        f'💾 HTML 体积压缩: {new_html_size/1024:.1f} KB (原始 {original_size/1024:.1f} KB)',
        f'节省比例: {saved_pct:.1f}%',
        '',
    ]
    print('\n'.join(report_lines))
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Command-line entry point: strip_page.py <input.html> [output_dir]
    cli_args = sys.argv[1:]
    if not cli_args:
        print('用法: python3 strip_page.py <input.html> [output_dir]')
        sys.exit(1)
    # The output directory is optional; None lets strip_html pick its default.
    target_dir = cli_args[1] if len(cli_args) > 1 else None
    strip_html(cli_args[0], target_dir)
|