summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Xu (Hello71) <alex_y_xu@yahoo.ca>2021-10-05 15:22:21 -0400
committerAlex Xu (Hello71) <alex_y_xu@yahoo.ca>2021-10-05 15:22:21 -0400
commit746ab5961c023450f7774d568e052cee44946765 (patch)
treefc7e69f0db209cdfc747e1eff01fbf3a5cf8a784
parent12b4e12865088354c94cdf6d6b6c0534185253bd (diff)
downloadwfs-746ab5961c023450f7774d568e052cee44946765.tar.xz
wfs-746ab5961c023450f7774d568e052cee44946765.zip
get rid of web workers, refactor
-rwxr-xr-xwfs.py259
1 files changed, 95 insertions, 164 deletions
diff --git a/wfs.py b/wfs.py
index 08ce689..e67efbe 100755
--- a/wfs.py
+++ b/wfs.py
@@ -4,211 +4,142 @@ import argparse
import logging
import multiprocessing
import os
-import signal
+import pathlib
import sys
-from itertools import chain
-from multiprocessing import Pool
-from multiprocessing.util import Finalize
-from pathlib import Path
-from urllib.parse import urlparse
-
from fontTools.subset import Options, Subsetter, load_font
from selenium import webdriver
-logging.basicConfig(format='[%(relativeCreated)d] %(message)s')
-logger = logging.getLogger('websubset')
+logging.basicConfig(format='%(levelname)s: %(message)s')
+logger = logging.getLogger('wfs')
logger.setLevel(logging.INFO)
-EXTRACT_SCRIPT = r'''
- let whitelist = new Set(arguments[0]);
- let walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
- let node, dict = {};
- while (node = walker.nextNode()) {
- let cs = getComputedStyle(node.parentNode);
- let css = k => cs.getPropertyValue(k);
- if (css('display') == 'none')
- continue;
- let k = css('font-family').replace(/"/g, '').replace(/,.*/, '') + ';' +
- css('font-weight') + ';' +
- css('font-style');
- if (!whitelist.has(k))
- continue;
- if (!(k in dict))
- dict[k] = '';
- dict[k] += node.nodeValue;
- }
- return dict;
-'''
-
-def gen_font_face(font):
- if 'fontfile' not in font:
- return ''
- return ''.join([
- '@font-face{',
- 'font-family:"', font['family'], '";',
- 'font-weight:', font['weight'], ';',
- 'font-style:', font['style'], ';',
- 'src: url("', font["fontfile"], '");',
- '}'])
-
-DRIVER = None
-def stop_driver():
- global DRIVER
- if DRIVER:
- DRIVER.quit()
- DRIVER = None
-def hook_sig(signum):
- orig_handler = signal.getsignal(signum)
- if orig_handler is None:
- raise Exception('{signum} handler is None')
- def term_handler(*_):
- stop_driver()
- signal.signal(signum, orig_handler)
- os.kill(os.getpid(), signum)
- signal.signal(signum, term_handler)
-def start_wworker(driver_name):
- # clamp selenium wasteful sleeps to 0.1s
- import time
- from time import sleep
- time.sleep = lambda secs: sleep(min(secs, 0.1))
+def make_uri(path):
+ if ':' in path:
+ return path
+ return pathlib.Path(path).resolve().as_uri()
- hook_sig(signal.SIGTERM)
- global DRIVER
+def start_driver(driver_name):
if driver_name == 'chrome':
chrome_options = webdriver.chrome.options.Options()
chrome_options.headless = True
chrome_options.experimental_options["prefs"] = {
"profile.default_content_setting_values.images": 2
}
- DRIVER = webdriver.Chrome(options=chrome_options, desired_capabilities={'detach': True})
- elif driver_name == 'firefox':
+ return webdriver.Chrome(options=chrome_options)
+ if driver_name == 'firefox':
firefox_profile = webdriver.FirefoxProfile()
firefox_profile.set_preference('permissions.default.image', 2)
firefox_options = webdriver.firefox.options.Options()
firefox_options.headless = True
- DRIVER = webdriver.Firefox(firefox_profile=firefox_profile, options=firefox_options)
- else:
- raise Exception('unknown driver name')
- Finalize(DRIVER, stop_driver, exitpriority=16)
-
-def is_uri(path):
- parsed = urlparse(path)
- return parsed.scheme and parsed.netloc
-def make_uri(path):
- if is_uri(path):
- return path
- else:
- return Path(path).resolve().as_uri()
-
-def extract(path, fonts, screenshots):
- logger.info('fetching %s', path)
- DRIVER.get(make_uri(path))
- if screenshots:
- logger.info('replacing fonts for %s', path)
- height = DRIVER.execute_script(''.join([
- "let style = document.createElement('style'); style.innerHTML = '",
- ''.join(gen_font_face(font) for font in fonts),
- "'; document.body.appendChild(style); return document.documentElement.scrollHeight"]))
- logger.info('taking pre-screenshot for %s', path)
- DRIVER.set_window_size(1920, height)
- screenshot = DRIVER.get_screenshot_as_png()
- else:
- screenshot = None
- logger.info('extracting text from %s', path)
- whitelist = [';'.join((f['family'], f['weight'], f['style'])) for f in fonts]
- return (path, DRIVER.execute_script(EXTRACT_SCRIPT, whitelist), screenshot)
+ return webdriver.Firefox(firefox_profile=firefox_profile, options=firefox_options)
+ raise Exception('unknown driver name')
-def get_fontdesc(fonts, fontspec):
- font_match = dict(zip(('family', 'weight', 'style'), fontspec.split(';')))
- for font in fonts:
- if font_match.items() <= font.items():
- return font
- return None
-
-def subset(fontdesc, text, fts_opts):
- fontfile = fontdesc['fontfile']
+def subset(fontfile, text, fts_opts):
logger.info('subsetting %s', fontfile)
font = load_font(fontfile, fts_opts, dontLoadGlyphNames=True)
subsetter = Subsetter(options=fts_opts)
subsetter.populate(text=text)
subsetter.subset(font)
- ret = []
- outfile = fontfile[:fontfile.rindex('.')] + '.subset.woff2'
- ret.append((font, 'woff2', outfile))
- return ret
-
-def write_subset(font, flavor, outfile):
+ font.flavor = 'woff2'
+ outfile = fontfile[:fontfile.rindex('.')] + '.subset.' + font.flavor
logger.info('writing %s', outfile)
- font.flavor = flavor
font.save(outfile)
-def verify(path, screenshot_begin_png):
- logger.info('refetching %s', path)
- DRIVER.get(make_uri(path))
-
- from io import BytesIO
- from PIL import Image, ImageChops
- screenshot_begin = Image.open(BytesIO(screenshot_begin_png), formats=('PNG',)).convert('RGB')
-
- logger.info('taking post-screenshot for %s', path)
- DRIVER.set_window_size(*screenshot_begin.size)
- screenshot_end_png = DRIVER.get_screenshot_as_png()
-
- screenshot_end = Image.open(BytesIO(screenshot_end_png), formats=('PNG',)).convert('RGB')
- logger.info('checking %s screenshots for %s', 'x'.join(map(str, screenshot_end.size)), path)
- if ImageChops.difference(screenshot_begin, screenshot_end).getbbox():
- raise Exception(f'screenshots do not match for {path}')
-
def main(argv):
- parser = argparse.ArgumentParser(description='Web Font Subsetter', epilog='see pyftsubset --help for additional options')
- parser.add_argument('--driver', help='selenium driver name (chrome or firefox)', default='chrome')
+ parser = argparse.ArgumentParser(description='Web Font Subsetter',
+ epilog='see pyftsubset --help for additional options')
+ parser.add_argument('--driver',
+ help='selenium driver name (chrome or firefox)', default='chrome')
parser.add_argument('--no-screenshots', help='skip screenshot validation', action='store_true')
parser.add_argument('--font', help='add font (fontfile:family:weight:style)', action='append')
parser.add_argument('file', help='html files', nargs='+')
args, leftover = parser.parse_known_intermixed_args(argv)
options = Options()
files = args.file + options.parse_opts(leftover)
- if any([file[0] == '-' for file in files]):
+ if any(file[0] == '-' for file in files):
parser.print_usage()
raise Exception('bad arguments')
- if options.with_zopfli:
- from fontTools.ttLib import sfnt
- sfnt.USE_ZOPFLI = True
- ncpus = len(os.sched_getaffinity(0))
- fonts = [dict(zip(['fontfile', 'family', 'weight', 'style'], font.split(':'))) for font in args.font]
- nwworkers = min(len(files), ncpus)
- nfworkers = min(len(fonts), ncpus)
- logger.info('using %d web workers, %d font workers', nwworkers, nfworkers)
+ if not args.no_screenshots:
+ from io import BytesIO
+ from PIL import Image, ImageChops
+ fonts = {}
+ for font in args.font:
+ fontlst = font.split(':')
+ fonts[(fontlst[1], fontlst[2] or '400', fontlst[3] or 'normal')] = fontlst[0]
+
+ # clamp selenium wasteful sleeps
+ import time
+ sleep = time.sleep
+ time.sleep = lambda secs: sleep(min(secs, 0.1))
- with Pool(nfworkers) as fpool, \
- Pool(nwworkers, start_wworker, (args.driver,)) as wpool:
- all_font_texts = {}
+ with start_driver(args.driver) as driver:
+ font_texts = {}
screenshots = []
- extract_args = ((file, fonts, not args.no_screenshots) for file in args.file)
- extracted = wpool.starmap(extract, extract_args)
- for path, font_texts, screenshot in extracted:
+ for path in args.file:
+ logger.info('fetching %s', path)
+ driver.get(make_uri(path))
if not args.no_screenshots:
- screenshots.append((path, screenshot))
- for fontspec, text in font_texts.items():
- if fontspec in all_font_texts:
- all_font_texts[fontspec] |= set(text)
+ logger.info('replacing fonts for %s', path)
+ height = driver.execute_script("""
+ let style = document.createElement('style');
+ style.innerHTML = arguments[0];
+ document.body.appendChild(style);
+ return document.documentElement.scrollHeight;
+ """, ''.join(f'''
+ @font-face{{
+ font-family: "{fontdesc[0]}";
+ font-weight: {fontdesc[1]};
+ font-style: {fontdesc[2]};
+ src: url({fontfile});
+ }}''' for fontdesc, fontfile in fonts.items()))
+ logger.info('taking pre-screenshot for %s', path)
+ driver.set_window_size(1920, height)
+ screenshots.append((path, driver.get_screenshot_as_png()))
+ logger.info('extracting text from %s', path)
+ for fontstr, text in driver.execute_script(r'''
+ const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
+ let node, dict = {};
+ while (node = walker.nextNode()) {
+ const cs = getComputedStyle(node.parentNode);
+ const css = k => cs.getPropertyValue(k);
+ if (css('display') == 'none') continue;
+ const k = css('font-family').replace(/"/g, '').replace(/,.*/, '') + ';' +
+ css('font-weight') + ';' + css('font-style');
+ if (k in dict) dict[k] += node.nodeValue;
+ else dict[k] = node.nodeValue;
+ }
+ return dict;
+ ''').items():
+
+ fontspec = tuple(fontstr.split(';'))
+ if fontspec in font_texts:
+ font_texts[fontspec] |= set(text)
else:
- all_font_texts[fontspec] = set(text)
+ font_texts[fontspec] = set(text)
if args.no_screenshots:
- logger.info('shutting down web workers early')
- wpool.close()
- subset_args = (
- (get_fontdesc(fonts, fontspec), ''.join(text), options)
- for fontspec, text in all_font_texts.items())
- subsetted = fpool.starmap(subset, subset_args)
- fpool.starmap(write_subset, chain(*subsetted))
- if not args.no_screenshots:
- wpool.starmap(verify, screenshots, 1)
- if multiprocessing.active_children():
- logger.info('waiting for workers')
- for proc in multiprocessing.active_children():
- proc.join()
+ logger.info('shutting down driver')
+ driver.close()
+ with multiprocessing.Pool(min(len(fonts), len(os.sched_getaffinity(0)))) as fpool:
+ jobs = []
+ for fontspec, text in font_texts.items():
+ try:
+ jobs.append((fonts[fontspec], ''.join(text), options))
+ except KeyError:
+ logger.warning('missing font %s', fontspec)
+ fpool.starmap(subset, jobs)
+ while screenshots:
+ path, start_png = screenshots.pop()
+ start = Image.open(BytesIO(start_png), formats=('PNG',))
+ logger.info('checking %dx%d screenshot for %s', *start.size, path)
+ driver.get(make_uri(path))
+
+ logger.info('taking post-screenshot for %s', path)
+ driver.set_window_size(*start.size)
+
+ end = Image.open(BytesIO(driver.get_screenshot_as_png()), formats=('PNG',))
+ if ImageChops.difference(start.convert('RGB'), end.convert('RGB')).getbbox():
+ raise Exception(f'screenshots do not match for {path}')
logger.info('exiting successfully')
if __name__ == '__main__':