/*
 * Copyright 2019 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17import { Request, Response } from 'express';
18import puppeteer = require('puppeteer');
19import { log } from './logger';
20import { ContentNode } from './types';
21import { PlainTextFormatter } from './plain_text_formatter';
22
// Command-line switches handed to Chrome when it is launched through puppeteer.
// `--no-sandbox` follows the approach taken in
// https://github.com/ebidel/try-puppeteer/commit/aacadb54abf861a807e9a71ee54d03abbf21d193
// NOTE(review): presumably the sandbox cannot start in the hosting
// environment - confirm before removing the switch.
const CHROME_LAUNCH_ARGS: string[] = [
  '--no-sandbox',
  '--enable-dom-distiller',
];
26
// Shape of a lookup table keyed by lower-cased DOM node name. A name that is
// present (mapped to `true`) marks a node type that is usually not useful when
// fetching text content from a page.
type BannedNames = Record<string, true>;
32
/**
 * Express handler for a license request.
 *
 * Reads `url` from the request body, extracts the text content of that page
 * and replies with it as plain text.
 *
 * Responses:
 *  - 200 with the formatted text when the page was processed.
 *  - 400 'URL required' when the body is missing or `url` is not a non-empty
 *    string.
 *  - 400 'Something bad happened. Check the logs' when fetching or formatting
 *    fails; the underlying error is logged.
 *
 * NOTE(review): `url` comes straight from the client and is opened in a
 * browser as-is; consider restricting the accepted schemes/hosts.
 */
export async function handleRequest(request: Request, response: Response): Promise<void> {
  // `request.body` is only populated when a body parser ran; guard against it
  // being absent so that a bad request gets a reply instead of a thrown
  // TypeError outside of the try block.
  const body = request.body;
  const url = body ? body.url : undefined;
  if (typeof url !== 'string' || url.length === 0) {
    response.status(400).send('URL required');
    return;
  }

  try {
    log(`Handling license request for ${url}`);
    const nodes = await handleLicenseRequest(url);
    const content = PlainTextFormatter.plainTextFor(nodes);
    response.status(200).send(content);
  } catch (error) {
    log('Error handling license request ', error);
    response.status(400).send('Something bad happened. Check the logs');
  }
}
52
/**
 * Opens `url` in a freshly launched Chrome instance and returns the text
 * found under the page's <body> as a flat, document-ordered list.
 *
 * Each returned entry carries the lower-cased node name and that node's own
 * text; nodes whose name is banned (links, scripts, navigation, ...) are
 * skipped together with everything below them.
 *
 * The browser is always closed, including when navigation or extraction
 * fails, so a failed request does not leave a Chrome process behind.
 *
 * @param url address of the page to load.
 * @returns the nodes that have non-empty text content.
 */
async function handleLicenseRequest(url: string): Promise<ContentNode[]> {
  const browser = await puppeteer.launch({ args: CHROME_LAUNCH_ARGS });
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'domcontentloaded' });
    // The callback is run by puppeteer inside the page, not in this process,
    // so every helper and constant it needs is declared within it.
    const content = await page.evaluate(() => {
      // Node names (lower-cased) whose subtree is ignored.
      const BANNED_LOCAL_NAMES: BannedNames = {
        'a': true,
        'button': true,
        'canvas': true,
        'footer': true,
        'header': true,
        'code': true,
        'img': true,
        'nav': true,
        'script': true,
        'style': true,
        'svg': true,
      };

      // Converts every entry of a NodeList, dropping the ones that were
      // rejected (banned or missing).
      function contentForNodeList(list: NodeList | null | undefined): ContentNode[] {
        const contentNodes: ContentNode[] = [];
        if (!list) {
          return contentNodes;
        }

        for (let i = 0; i < list.length; i += 1) {
          const node = contentForNode(list.item(i));
          if (node) {
            contentNodes.push(node);
          }
        }
        return contentNodes;
      }

      // Depth-first walk that appends each node with non-empty text to
      // `accumulator`, parent before children.
      const contentWithPath = function (node: ContentNode, accumulator: ContentNode[]) {
        if (node.textContent && node.textContent.length > 0) {
          accumulator.push({ localName: node.localName, textContent: node.textContent });
        }
        if (node.children) {
          for (let i = 0; i < node.children.length; i += 1) {
            contentWithPath(node.children[i], accumulator);
          }
        }
      };

      // Builds the tree entry for a single DOM node, or null when the node is
      // absent or banned.
      function contentForNode(node: Node | null | undefined) {
        if (!node) {
          return null;
        }

        const name = node.nodeName.toLowerCase();
        // Check if node is banned.
        if (name && BANNED_LOCAL_NAMES[name] === true) {
          return null;
        }
        // Shallow clone: the copy has no children, so its textContent is only
        // the node's own text (e.g. the data of a text node) and does not
        // repeat the text of descendants, which are visited separately below.
        const cloned = node.cloneNode();
        const localName = name;
        const textContent = cloned.textContent;
        const children = contentForNodeList(node.childNodes);
        return {
          localName: localName,
          textContent: textContent,
          children: children
        };
      }

      const body = document.querySelector('body');
      const nodes: ContentNode[] =
        body == null ? [] : contentForNodeList(body.childNodes);

      // Accumulate nodes with content
      const accumulator: ContentNode[] = [];
      for (let i = 0; i < nodes.length; i += 1) {
        contentWithPath(nodes[i], accumulator);
      }
      return accumulator;
    });
    return content;
  } finally {
    // Runs on success and on failure so the Chrome process is never leaked.
    await browser.close();
  }
}