get rid of web workers, refactor

author: Alex Xu (Hello71) <alex_y_xu@yahoo.ca> 2021-10-05 15:22:21 -0400
committer: Alex Xu (Hello71) <alex_y_xu@yahoo.ca> 2021-10-05 15:22:21 -0400
commit: 746ab5961c023450f7774d568e052cee44946765 (patch)
tree: fc7e69f0db209cdfc747e1eff01fbf3a5cf8a784
parent: 12b4e12865088354c94cdf6d6b6c0534185253bd (diff)
download: wfs-746ab5961c023450f7774d568e052cee44946765.tar.xz wfs-746ab5961c023450f7774d568e052cee44946765.zip
1 file changed, 95 insertions, 164 deletions
diff --git a/wfs.py b/wfs.py
index 08ce689..e67efbe 100755
--- a/wfs.py
+++ b/wfs.py
@@ -4,211 +4,142 @@ import argparse
 import logging
 import multiprocessing
 import os
-import signal
+import pathlib
 import sys
 
-from itertools import chain
-from multiprocessing import Pool
-from multiprocessing.util import Finalize
-from pathlib import Path
-from urllib.parse import urlparse
-
 from fontTools.subset import Options, Subsetter, load_font
 from selenium import webdriver
 
-logging.basicConfig(format='[%(relativeCreated)d] %(message)s')
-logger = logging.getLogger('websubset')
+logging.basicConfig(format='%(levelname)s: %(message)s')
+logger = logging.getLogger('wfs')
 logger.setLevel(logging.INFO)
 
-EXTRACT_SCRIPT = r'''
-    let whitelist = new Set(arguments[0]);
-    let walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
-    let node, dict = {};
-    while (node = walker.nextNode()) {
-        let cs = getComputedStyle(node.parentNode);
-        let css = k => cs.getPropertyValue(k);
-        if (css('display') == 'none')
-            continue;
-        let k = css('font-family').replace(/"/g, '').replace(/,.*/, '') + ';' +
-                css('font-weight') + ';' +
-                css('font-style');
-        if (!whitelist.has(k))
-            continue;
-        if (!(k in dict))
-            dict[k] = '';
-        dict[k] += node.nodeValue;
-    }
-    return dict;
-'''
-
-def gen_font_face(font):
-    if 'fontfile' not in font:
-        return ''
-    return ''.join([
-        '@font-face{',
-        'font-family:"', font['family'], '";',
-        'font-weight:', font['weight'], ';',
-        'font-style:', font['style'], ';',
-        'src: url("', font["fontfile"], '");',
-        '}'])
-
-DRIVER = None
-def stop_driver():
-    global DRIVER
-    if DRIVER:
-        DRIVER.quit()
-        DRIVER = None
-def hook_sig(signum):
-    orig_handler = signal.getsignal(signum)
-    if orig_handler is None:
-        raise Exception('{signum} handler is None')
-    def term_handler(*_):
-        stop_driver()
-        signal.signal(signum, orig_handler)
-        os.kill(os.getpid(), signum)
-    signal.signal(signum, term_handler)
-def start_wworker(driver_name):
-    # clamp selenium wasteful sleeps to 0.1s
-    import time
-    from time import sleep
-    time.sleep = lambda secs: sleep(min(secs, 0.1))
+def make_uri(path):
+    if ':' in path:
+        return path
+    return pathlib.Path(path).resolve().as_uri()
 
-    hook_sig(signal.SIGTERM)
-    global DRIVER
+def start_driver(driver_name):
     if driver_name == 'chrome':
         chrome_options = webdriver.chrome.options.Options()
         chrome_options.headless = True
         chrome_options.experimental_options["prefs"] = {
             "profile.default_content_setting_values.images": 2
         }
-        DRIVER = webdriver.Chrome(options=chrome_options, desired_capabilities={'detach': True})
-    elif driver_name == 'firefox':
+        return webdriver.Chrome(options=chrome_options)
+    if driver_name == 'firefox':
         firefox_profile = webdriver.FirefoxProfile()
         firefox_profile.set_preference('permissions.default.image', 2)
         firefox_options = webdriver.firefox.options.Options()
         firefox_options.headless = True
-        DRIVER = webdriver.Firefox(firefox_profile=firefox_profile, options=firefox_options)
-    else:
-        raise Exception('unknown driver name')
-    Finalize(DRIVER, stop_driver, exitpriority=16)
+        return webdriver.Firefox(firefox_profile=firefox_profile, options=firefox_options)
+    raise Exception('unknown driver name')
 
-def is_uri(path):
-    parsed = urlparse(path)
-    return parsed.scheme and parsed.netloc
-def make_uri(path):
-    if is_uri(path):
-        return path
-    else:
-        return Path(path).resolve().as_uri()
-
-def extract(path, fonts, screenshots):
-    logger.info('fetching %s', path)
-    DRIVER.get(make_uri(path))
-    if screenshots:
-        logger.info('replacing fonts for %s', path)
-        height = DRIVER.execute_script(''.join([
-            "let style = document.createElement('style'); style.innerHTML = '",
-            ''.join(gen_font_face(font) for font in fonts),
-            "'; document.body.appendChild(style); return document.documentElement.scrollHeight"]))
-        logger.info('taking pre-screenshot for %s', path)
-        DRIVER.set_window_size(1920, height)
-        screenshot = DRIVER.get_screenshot_as_png()
-    else:
-        screenshot = None
-    logger.info('extracting text from %s', path)
-    whitelist = [';'.join((f['family'], f['weight'], f['style'])) for f in fonts]
-    return (path, DRIVER.execute_script(EXTRACT_SCRIPT, whitelist), screenshot)
-
-def get_fontdesc(fonts, fontspec):
-    font_match = dict(zip(('family', 'weight', 'style'), fontspec.split(';')))
-    for font in fonts:
-        if font_match.items() <= font.items():
-            return font
-    return None
-
-def subset(fontdesc, text, fts_opts):
-    fontfile = fontdesc['fontfile']
+def subset(fontfile, text, fts_opts):
     logger.info('subsetting %s', fontfile)
     font = load_font(fontfile, fts_opts, dontLoadGlyphNames=True)
     subsetter = Subsetter(options=fts_opts)
     subsetter.populate(text=text)
     subsetter.subset(font)
-    ret = []
-    outfile = fontfile[:fontfile.rindex('.')] + '.subset.woff2'
-    ret.append((font, 'woff2', outfile))
-    return ret
-
-def write_subset(font, flavor, outfile):
+    font.flavor = 'woff2'
+    outfile = fontfile[:fontfile.rindex('.')] + '.subset.' + font.flavor
     logger.info('writing %s', outfile)
-    font.flavor = flavor
     font.save(outfile)
 
-def verify(path, screenshot_begin_png):
-    logger.info('refetching %s', path)
-    DRIVER.get(make_uri(path))
-
-    from io import BytesIO
-    from PIL import Image, ImageChops
-    screenshot_begin = Image.open(BytesIO(screenshot_begin_png), formats=('PNG',)).convert('RGB')
-
-    logger.info('taking post-screenshot for %s', path)
-    DRIVER.set_window_size(*screenshot_begin.size)
-    screenshot_end_png = DRIVER.get_screenshot_as_png()
-
-    screenshot_end = Image.open(BytesIO(screenshot_end_png), formats=('PNG',)).convert('RGB')
-    logger.info('checking %s screenshots for %s', 'x'.join(map(str, screenshot_end.size)), path)
-    if ImageChops.difference(screenshot_begin, screenshot_end).getbbox():
-        raise Exception(f'screenshots do not match for {path}')
-
 def main(argv):
-    parser = argparse.ArgumentParser(description='Web Font Subsetter', epilog='see pyftsubset --help for additional options')
-    parser.add_argument('--driver', help='selenium driver name (chrome or firefox)', default='chrome')
+    parser = argparse.ArgumentParser(description='Web Font Subsetter',
+            epilog='see pyftsubset --help for additional options')
+    parser.add_argument('--driver',
+            help='selenium driver name (chrome or firefox)', default='chrome')
     parser.add_argument('--no-screenshots', help='skip screenshot validation', action='store_true')
     parser.add_argument('--font', help='add font (fontfile:family:weight:style)', action='append')
     parser.add_argument('file', help='html files', nargs='+')
     args, leftover = parser.parse_known_intermixed_args(argv)
     options = Options()
     files = args.file + options.parse_opts(leftover)
-    if any([file[0] == '-' for file in files]):
+    if any(file[0] == '-' for file in files):
         parser.print_usage()
         raise Exception('bad arguments')
-    if options.with_zopfli:
-        from fontTools.ttLib import sfnt
-        sfnt.USE_ZOPFLI = True
-    ncpus = len(os.sched_getaffinity(0))
-    fonts = [dict(zip(['fontfile', 'family', 'weight', 'style'], font.split(':'))) for font in args.font]
-    nwworkers = min(len(files), ncpus)
-    nfworkers = min(len(fonts), ncpus)
-    logger.info('using %d web workers, %d font workers', nwworkers, nfworkers)
+    if not args.no_screenshots:
+        from io import BytesIO
+        from PIL import Image, ImageChops
+    fonts = {}
+    for font in args.font:
+        fontlst = font.split(':')
+        fonts[(fontlst[1], fontlst[2] or '400', fontlst[3] or 'normal')] = fontlst[0]
+
+    # clamp selenium wasteful sleeps
+    import time
+    sleep = time.sleep
+    time.sleep = lambda secs: sleep(min(secs, 0.1))
 
-    with Pool(nfworkers) as fpool, \
-         Pool(nwworkers, start_wworker, (args.driver,)) as wpool:
-        all_font_texts = {}
+    with start_driver(args.driver) as driver:
+        font_texts = {}
         screenshots = []
-        extract_args = ((file, fonts, not args.no_screenshots) for file in args.file)
-        extracted = wpool.starmap(extract, extract_args)
-        for path, font_texts, screenshot in extracted:
+        for path in args.file:
+            logger.info('fetching %s', path)
+            driver.get(make_uri(path))
             if not args.no_screenshots:
-                screenshots.append((path, screenshot))
-            for fontspec, text in font_texts.items():
-                if fontspec in all_font_texts:
-                    all_font_texts[fontspec] |= set(text)
+                logger.info('replacing fonts for %s', path)
+                height = driver.execute_script("""
+                    let style = document.createElement('style');
+                    style.innerHTML = arguments[0];
+                    document.body.appendChild(style);
+                    return document.documentElement.scrollHeight;
+                """, ''.join(f'''
+                    @font-face{{
+                        font-family: "{fontdesc[0]}";
+                        font-weight: {fontdesc[1]};
+                        font-style: {fontdesc[2]};
+                        src: url({fontfile});
+                    }}''' for fontdesc, fontfile in fonts.items()))
+                logger.info('taking pre-screenshot for %s', path)
+                driver.set_window_size(1920, height)
+                screenshots.append((path, driver.get_screenshot_as_png()))
+            logger.info('extracting text from %s', path)
+            for fontstr, text in driver.execute_script(r'''
+                    const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
+                    let node, dict = {};
+                    while (node = walker.nextNode()) {
+                        const cs = getComputedStyle(node.parentNode);
+                        const css = k => cs.getPropertyValue(k);
+                        if (css('display') == 'none') continue;
+                        const k = css('font-family').replace(/"/g, '').replace(/,.*/, '') + ';' +
+                                  css('font-weight') + ';' + css('font-style');
+                        if (k in dict) dict[k] += node.nodeValue;
+                        else dict[k] = node.nodeValue;
+                    }
+                    return dict;
+                ''').items():
+
+                fontspec = tuple(fontstr.split(';'))
+                if fontspec in font_texts:
+                    font_texts[fontspec] |= set(text)
                 else:
-                    all_font_texts[fontspec] = set(text)
+                    font_texts[fontspec] = set(text)
         if args.no_screenshots:
-            logger.info('shutting down web workers early')
-            wpool.close()
-        subset_args = (
-                (get_fontdesc(fonts, fontspec), ''.join(text), options)
-                for fontspec, text in all_font_texts.items())
-        subsetted = fpool.starmap(subset, subset_args)
-        fpool.starmap(write_subset, chain(*subsetted))
-        if not args.no_screenshots:
-            wpool.starmap(verify, screenshots, 1)
-    if multiprocessing.active_children():
-        logger.info('waiting for workers')
-        for proc in multiprocessing.active_children():
-            proc.join()
+            logger.info('shutting down driver')
+            driver.close()
+        with multiprocessing.Pool(min(len(fonts), len(os.sched_getaffinity(0)))) as fpool:
+            jobs = []
+            for fontspec, text in font_texts.items():
+                try:
+                    jobs.append((fonts[fontspec], ''.join(text), options))
+                except KeyError:
+                    logger.warning('missing font %s', fontspec)
+            fpool.starmap(subset, jobs)
+        while screenshots:
+            path, start_png = screenshots.pop()
+            start = Image.open(BytesIO(start_png), formats=('PNG',))
+            logger.info('checking %dx%d screenshot for %s', *start.size, path)
+            driver.get(make_uri(path))
+
+            logger.info('taking post-screenshot for %s', path)
+            driver.set_window_size(*start.size)
+
+            end = Image.open(BytesIO(driver.get_screenshot_as_png()), formats=('PNG',))
+            if ImageChops.difference(start.convert('RGB'), end.convert('RGB')).getbbox():
+                raise Exception(f'screenshots do not match for {path}')
     logger.info('exiting successfully')
 
 if __name__ == '__main__':
author	Alex Xu (Hello71) <alex_y_xu@yahoo.ca>	2021-10-05 15:22:21 -0400
committer	Alex Xu (Hello71) <alex_y_xu@yahoo.ca>	2021-10-05 15:22:21 -0400
commit	746ab5961c023450f7774d568e052cee44946765 (patch)
tree	fc7e69f0db209cdfc747e1eff01fbf3a5cf8a784
parent	12b4e12865088354c94cdf6d6b6c0534185253bd (diff)
download	wfs-746ab5961c023450f7774d568e052cee44946765.tar.xz wfs-746ab5961c023450f7774d568e052cee44946765.zip