Technology Sharing

Puppeteer generates images to generate PDF

2024-07-12

한어Русский языкEnglishFrançaisIndonesianSanskrit日本語DeutschPortuguêsΕλληνικάespañolItalianoSuomalainenLatina

Project Background

In 2022, due to the limited team permissions, we still needed to generate images without opening new pages or services. The performance requirements were also relatively high. At that time, we adopted the Html2canvas front-end generation solution and made more than a dozen core business modules. Under Web, H5, App, and Electron PC App, the speed was 300ms~2500ms to generate and download images.

This year, we encountered the same requirement. However, since the business embedded dozens of large-screen Iframes composed of charts, Html2canvas had some difficulty in solving the Iframe problem. We also considered the solution of using the server to take screenshots with team permissions, so we started with Puppeteer.

Additional instructions for generating PDF

Generating PDF with puppeteer doesn't work, so the solution uses sharp to manually cut the images generated by puppeteer into proportional sizes of the PDF page size, and then uses pdfkit scale to proportionally stuff the cut images into the PDF.

Code

The code is a POC solution. Before it can be used in production, more support work on stability and availability is needed. Therefore, the following code is only for learning reference.

The process code is not very meaningful for reference. The core code of cropping pictures, automatic scrolling, waiting for the specified Iframe to load, and obtaining the attribute value of the specified element are more meaningful for reference.

// 流程代码
import { ConsoleLogger } from '@nestjs/common';
import puppeteer, { Viewport } from 'puppeteer';
import { Browser, CookieParam } from 'puppeteer';
import * as helper from './helper';
const logger = new ConsoleLogger('WebViewer');

export enum TargetFileType {
  png = 'png',
  jpeg = 'jpeg',
  webp = 'webp',
  pdf = 'pdf',
}

export interface ShotParam {
  type: TargetFileType;
  url: string;
  targetFile?: string;
  cookies?: CookieParam[];
  viewport?: Viewport;
  timeout?: number;
}

export class WebViewer {
  private browser: Browser | null = null;
  private inited = false;

  static defaultInstance = new WebViewer();

  constructor(private readonly options?: WebViewerOptions) {
    this.options = options || {
      socketTimeout: 2 * 60 * 1000,
      generateTimeout: 0,
    };
  }

  async init(): Promise<void> {
    this.browser = await puppeteer.launch({
      // defaultViewport: { width: 1920, height: 1080 },
      // headless: 'shell',
      headless: false,
      pipe: true,
      args: [
        '--disable-gpu',
        '--disable-dev-shm-usage',
        '--disable-setuid-sandbox',
        '--no-first-run',
        '--no-sandbox',
        '--no-zygote',
        '--full-memory-crash-report',
        '--unlimited-storage',
      ],
    });
    this.inited = true;
  }

  async shot(param: ShotParam): Promise<Buffer> {
    if (!this.inited) {
      await this.init();
    }

    logger.log(
      `Start to shot url: ${param.url}, type: ${
        param.type
      }, viewport: { heigth:${param.viewport?.height || 0}, width:${
        param.viewport?.width || 0
      }} `,
    );
    const page = await this.browser.newPage();
    page.on('response', (response) => {
      logger.debug(response.url());
    });
    page.on('close', () => {
      logger.debug('Current page has been closed.');
    });
    if (param.cookies) {
      await page.setCookie(...param.cookies);
    }
    await page.goto(param.url, {
      timeout: param?.timeout || this.options.socketTimeout,
      waitUntil: 'networkidle0',
    });
    await helper.waitForFrame(page);
    const minWidth = 1920;
    let width = await page.$eval('.html-table', (el) => el.scrollWidth + 36);
    width = width > minWidth ? width : minWidth;
    param.viewport = { width, height: 1080 };
    await page.setViewport(param.viewport);
    await helper.getValueFromElementDataset(
      page,
      'html',
      'height',
      async (value: string) => !Number.isNaN(Number(value)),
    );
    const bodyHandle = await page.$('body');
    const { height: bodyHeight } = await bodyHandle.boundingBox();
    param.viewport = { width, height: Math.floor(bodyHeight) + 1 };
    await bodyHandle.dispose();
    await page.setViewport(param.viewport);
    await page.waitForSelector('#datart-rendered');
    await helper.sleep(300);

    let buffer = await page.screenshot();
    if (param.type === 'pdf') {
      buffer = await helper.generatePdf(buffer);
    }
    await helper.sleep(300);
    await page.close();
    return buffer;
  }

  async close(): Promise<void> {
    await this.browser?.close();
  }
}

export interface WebViewerOptions {
  socketTimeout: number;
  generateTimeout?: number;
}

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
// 核心代码
import { Page } from 'puppeteer';
import * as sharp from 'sharp';
import * as pdfkit from 'pdfkit';
import * as getStream from 'get-stream';

function waitForFrame(page: Page) {
  let fulfill;
  const promise = new Promise((resolve) => (fulfill = resolve));
  checkFrame();
  return promise;

  function checkFrame() {
    const frame = page.frames().find((f) => {
      console.log(f.name());
      return f.name() === 'datart';
    });
    if (frame) fulfill(frame);
    else page.once('frameattached', checkFrame);
  }
}

async function autoScroll(page: Page, selector: string) {
  return page.evaluate((selector) => {
    return new Promise((resolve) => {
      //滚动的总高度
      let totalHeight = 0;
      //每次向下滚动的高度 100 px
      const distance = 100;
      const timer = setInterval(() => {
        const dom = document.querySelector(selector);
        if (!dom) {
          return clearInterval(timer);
        }
        //页面的高度 包含滚动高度
        const scrollHeight = dom.scrollHeight;
        console.log(scrollHeight);
        //滚动条向下滚动 distance
        dom.scrollBy(0, distance);
        totalHeight += distance;
        //当滚动的总高度 大于 页面高度 说明滚到底了。也就是说到滚动条滚到底时,以上还会继续累加,直到超过页面高度
        if (totalHeight >= scrollHeight) {
          clearInterval(timer);
          resolve(true);
        }
      }, 100);
    });
  }, selector);
}

async function getValueFromElementDataset(
  page: Page,
  selector: string,
  key: string,
  checkValue: (value: string) => Promise<boolean>,
) {
  return new Promise((resolve) => {
    const interval = setInterval(async () => {
      const value = await page.$eval(
        selector,
        (el: HTMLElement, key: string) => {
          return el.dataset[key];
        },
        key,
      );
      if (checkValue && !(await checkValue(value))) {
        return;
      } else {
        clearInterval(interval);
        resolve(value);
      }
    }, 500);
  });
}

async function clipImage(
  pageWidth: number,
  pageHeight: number,
  buffer: Buffer,
): Promise<{ images: Buffer[]; scale: number }> {
  const imageOriginSharp = sharp(buffer);
  const imageSharp = sharp(buffer).resize(pageWidth);

  const imageBuffer = await imageSharp
    .withMetadata()
    .toBuffer({ resolveWithObject: true });

  const imageOriginBuffer = await imageOriginSharp
    .withMetadata()
    .toBuffer({ resolveWithObject: true });

  // const imageWidth = imageBuffer.info.width;
  const imageHeight = imageBuffer.info.height;
  const imageOriginWidth = imageOriginBuffer.info.width;
  const imageOriginHeight = imageOriginBuffer.info.height;

  const scale = imageOriginHeight / imageHeight;

  const images: Buffer[] = [];
  console.log({
    imageOriginWidth,
    imageOriginHeight,
    scale,
  });

  let startY = 0;
  while (startY < imageHeight) {
    const height = Math.min(pageHeight, imageHeight - startY);
    console.log(Math.ceil(height * scale), Math.ceil(startY * scale));
    const imageOriginSharp = sharp(buffer);
    const liteImage = await imageOriginSharp
      .extract({
        width: imageOriginWidth,
        height: Math.ceil(height * scale),
        left: 0,
        top: Math.floor(startY * scale),
      })
      .toBuffer();

    images.push(liteImage);
    startY += height;
  }

  return { images, scale };
}

const generatePdf = (imageBuffer: Buffer) => {
  return new Promise<Buffer>(async (resolve) => {
    const doc = new pdfkit();

    // 获取PDF页面的宽度和高度
    const pageWidth = doc.page.width;
    const pageHeight = doc.page.height;

    // 图片按 page 高度等比放缩然后裁切为多份,然后通过 doc addpage 以及 doc.image 把每张图放入 pdf
    const { images, scale } = await clipImage(
      pageWidth,
      pageHeight,
      imageBuffer,
    );
    let index = 1;

    for (const image of images) {
      doc.image(image, 0, 0, {
        width: pageWidth,
        scale: 1 / scale,
      });
      if (index < images.length) {
        doc.addPage();
        doc.switchToPage(index);
        index += 1;
      }
    }

    doc.end();
    const pdfBuffer = await getStream.buffer(doc);
    resolve(pdfBuffer);
  });
};

const sleep = (time: number) => new Promise((r) => setTimeout(r, time));

export {
  sleep,
  clipImage,
  autoScroll,
  generatePdf,
  waitForFrame,
  getValueFromElementDataset,
};

  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
  • 100
  • 101
  • 102
  • 103
  • 104
  • 105
  • 106
  • 107
  • 108
  • 109
  • 110
  • 111
  • 112
  • 113
  • 114
  • 115
  • 116
  • 117
  • 118
  • 119
  • 120
  • 121
  • 122
  • 123
  • 124
  • 125
  • 126
  • 127
  • 128
  • 129
  • 130
  • 131
  • 132
  • 133
  • 134
  • 135
  • 136
  • 137
  • 138
  • 139
  • 140
  • 141
  • 142
  • 143
  • 144
  • 145
  • 146
  • 147
  • 148
  • 149
  • 150
  • 151
  • 152
  • 153
  • 154
  • 155
  • 156
  • 157
  • 158
  • 159
  • 160
  • 161
  • 162
  • 163
  • 164
  • 165
  • 166
  • 167
  • 168
  • 169
  • 170
  • 171