Experimental/prototype license checker tool.

Change-Id: Id309cfde64f556b24c511331756a568a4f4af7c2
diff --git a/services/licenses.php b/services/licenses.php
new file mode 100755
index 0000000..7dda28f
--- /dev/null
+++ b/services/licenses.php
@@ -0,0 +1,291 @@
+<?php
+/*******************************************************************************
+ * Copyright (c) 2019 Eclipse Foundation and others.
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+/*
+ * This file is a prototype for executing a license scan of a bill of
+ * materials. The bill of materials is provided as plain text with each
+ * line containing one unit of content (i.e., a library). Blank lines
+ * and comment lines are skipped (comment lines start with "#" or "//").
+ *
+ * The id for a unit of content may be expressed as Maven coordinates of the form
+ * "groupid:artifactid[:packaging]:version", as abridged Purl coordinates of the form
+ * "type/namespace/name@version" (TODO not yet recognised by normalizeId), or as ClearlyDefined
+ * coordinates of the form "type/source/namespace/name/version". Mixing formats is supported.
+ *
+ * Output is expressed as ClearlyDefined ids.
+ *
+ * Two sources of information are used to map ids to license information and other
+ * metadata: first, the Eclipse Foundation data is consulted and then
+ * the ClearlyDefined services are called. Future versions of this script
+ * may consult other sources of data.
+ *
+ * usage example:
+ *
+ * curl -X POST http://localhost/projects/services/licenses.php \
+ * -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
+ *
+ * Note that this works with an instance running on localhost.
+ *
+ * TODO Validate that we can scale to ~4K lines
+ * TODO support file upload.
+ */
+
+// Convert a content id to a ClearlyDefined id ("type/source/namespace/name/version"); null when neither a
+// five-segment ClearlyDefined id nor Maven coordinates occur in $id (matching is unanchored). Maven versions
+// start with a digit and keep qualifiers such as "-SNAPSHOT" or ".Final" instead of being cut at the numeric part.
+function normalizeId($id) {
+	if (preg_match('/([\w@\-.]+)(?:\/[\w@\-.]+){4}/', $id, $matches)) return $matches[0];
+	if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+)?:(\d\w*(?:[.\-]\w+)*)/', $id, $matches))
+		return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
+	return null;
+}
+
+/**
+ * Parse a plain-text bill of materials, one content id per line. Blank lines and comment
+ * lines (starting with "#" or "//") are skipped; scanning continues with the next line.
+ * @param string $content raw text of the bill of materials
+ * @return array 'unmatched' maps each recognised id to an empty array; 'invalid', when present, lists the other lines verbatim
+ */
+function loadFromString($content) {
+	$stream = fopen('php://memory','w+');
+	fwrite($stream, $content);
+	rewind($stream);
+	// Always provide 'unmatched' so that later stages can iterate it even when nothing parses.
+	$results = array('unmatched' => array());
+	// Compare with false explicitly: a final line of "0" is falsy, but it is still input.
+	while (($line = fgets($stream)) !== false) {
+		$text = trim($line);
+		if ($text === '' || preg_match('/^(?:#|\/\/)/', $text)) continue;
+		if ($id = normalizeId($text)) $results['unmatched'][$id] = array();
+		else $results['invalid'][] = $line;
+	}
+	fclose($stream);
+	return $results;
+}
+
+/*
+ * Match against the consolidated data from Eclipse Foundation sources, which
+ * a periodically-run script stores in the dashboard database.
+ *
+ * This function modifies the parameter: as matches are found, the id is removed
+ * from the "unmatched" set and the metadata that we do find is added to an array
+ * keyed by the status that the row reports.
+ *
+ * @see project-services/capture/php/import_third_party_license_data.php
+ */
+function matchAgainstFoundationData(&$results) {
+	// With no valid ids (the "unmatched" entry is absent or empty) there is nothing to query.
+	if (empty($results['unmatched'])) return;
+	foreach(array_keys($results['unmatched']) as $id) {
+		$sql = 'select
+				id, license, status, sourceUrl, definitionUrl, authority, confidence
+			from ThirdPartyLicense
+			where id=":id"';
+		query('dashboard', $sql, array(':id' => $id), function($row) use (&$results) {
+			unset($results['unmatched'][$row['id']]);
+			$results[$row['status']][$row['id']] = $row;
+		});
+	}
+}
+
+/**
+ * Instances of this class represent a single entry in the results
+ * returned by a ClearlyDefined query. Every instance wraps an
+ * array of returned values; this class exists primarily to
+ * encapsulate the means by which the data is queried.
+ */
+class ClearlyDefinedMetadata {
+	/** The id and the wrapped record; $providerUrl is declared so that its getter is defined (nothing here assigns it). */
+	public $id, $data, $providerUrl = null;
+
+	/** Wrap the values returned for one entry: $id is the entry's id, $data the array of returned values. */
+	public function __construct($id, $data) {
+		$this->id = $id;
+		$this->data = $data;
+	}
+
+	/** @return mixed the id given at construction */
+	public function getId() {
+		return $this->id;
+	}
+
+	/**
+	 * TODO Re-evaluate: only the declared license is reported; licenses reported by individual files are not combined in.
+	 * @return mixed|null the value at licensed/declared, or null when the record has none
+	 */
+	public function getLicense() {
+		if (!isset($this->data['licensed']['declared'])) return null;
+		return $this->data['licensed']['declared'];
+	}
+
+	/** @return mixed|null the value at described/sourceLocation/url, or null when the record has none */
+	public function getSourceUrl() {
+		if (!isset($this->data['described']['sourceLocation']['url'])) return null;
+		return $this->data['described']['sourceLocation']['url'];
+	}
+
+	/** @return null always, until this is implemented */
+	public function getDefinitionUrl() {
+		// TODO Implement me
+		return null;
+	}
+
+	/** @return number confidence in the license information, nominally in the range 0-100; -1 when the record has no score */
+	public function getScore() {
+		return $this->getEffectiveScore();
+	}
+
+	/** @return int the value at scores/effective cast to int, or -1 when the record has none */
+	public function getEffectiveScore() {
+		if (!isset($this->data['scores']['effective'])) return -1;
+		return (int)$this->data['scores']['effective'];
+	}
+
+	/** @return mixed|null the provider URL; null unless something has assigned $providerUrl */
+	public function getProviderUrl() {
+		return $this->providerUrl;
+	}
+}
+
+/**
+ * Manages the Eclipse Foundation's license white list and provides
+ * behaviour for making licensing decisions in consideration of that
+ * white list. Instances are obtained from Licenses::load().
+ */
+class Licenses {
+	var $whitelist;
+
+	/**
+	 * Read the white list and augment it with licenses that are acceptable, but not explicitly
+	 * included in our approved list. If the list cannot be read, only the augmenting entries remain.
+	 * @param string $url location of the white list. TODO Make configurable (and correct)
+	 * @return Licenses
+	 */
+	public static function load($url = 'http://localhost/legal/licenses.json') {
+		$json = file_get_contents($url);
+		$whitelist = ($json === false) ? null : json_decode($json, true);
+		if (!is_array($whitelist)) $whitelist = array(); // unreadable or malformed list
+		$whitelist['approved']['EPL-1.0'] = "Eclipse Public License, v1.0";
+		$whitelist['approved']['EPL-2.0'] = "Eclipse Public License, v2.0";
+		$whitelist['approved']['WTFPL'] = "WTFPL";
+		$whitelist['approved']['CC-BY-3.0'] = "CC-BY-3.0";
+		$whitelist['approved']['CC-BY-4.0'] = "CC-BY-4.0";
+		$whitelist['approved']['Unlicense'] = "Unlicense";
+		$whitelist['approved']['Artistic-2.0'] = "Artistic-2.0";
+		return new self($whitelist);
+	}
+
+	/** @param array $whitelist white list whose 'approved' entry is keyed by license code */
+	public function __construct($whitelist) {
+		$this->whitelist = $whitelist;
+	}
+
+	/**
+	 * Determine whether or not an SPDX expression is approved per our licenses white list,
+	 * that is, whether content so licensed is something an Eclipse project can leverage.
+	 *
+	 * TODO We probably need more sophisticated expression parsing. For now an AND condition
+	 * is considered weird and in need of attention (not approved), and an OR expression is
+	 * approved when any alternative is listed. We may later answer red/yellow/green instead.
+	 *
+	 * @param string|null $spdxExpression
+	 * @return boolean
+	 */
+	public function isApproved($spdxExpression) {
+		// A missing (null or empty) expression can never be approved.
+		if (!is_string($spdxExpression) || $spdxExpression === '') return false;
+		// Only AND as an operator counts; identifiers such as "StandardML-NJ" merely contain the letters.
+		if (preg_match('/(?<![\w.\-+])AND(?![\w.\-+])/i', $spdxExpression)) return false;
+		foreach(preg_split('/\s+OR\s+/i',$spdxExpression) as $code) {
+			// With AND excluded, grouping parentheses carry no meaning and are stripped.
+			if (isset($this->whitelist['approved'][trim($code, " \t\r\n()")])) return true;
+		}
+		return false;
+	}
+}
+
+/**
+ * Look up the ids that are still "unmatched" with ClearlyDefined. Every definition returned is removed from
+ * "unmatched" and filed under "approved" or "restricted" according to the white list; problems with the
+ * request or its response are appended to $results['errors'] and leave the remaining entries untouched.
+ * @param array $results results structure; modified in place
+ */
+function matchAgainstClearlyDefined(&$results) {
+	if (empty($results['unmatched'])) return; // nothing left to resolve: skip the white list fetch and the remote call
+	$unmatched = array_keys($results['unmatched']);
+	$licenses = Licenses::load();
+	$contents = false; // stays false unless the request succeeds, including when cUrl cannot be initialised
+	if ($curl = curl_init('https://api.clearlydefined.io/definitions')) {
+		$options = array(
+				CURLOPT_USERAGENT => "Eclipse",
+				CURLOPT_RETURNTRANSFER => true,
+				CURLOPT_FOLLOWLOCATION => true,
+				CURLOPT_POST => TRUE,
+				CURLOPT_CONNECTTIMEOUT => 5,
+				CURLOPT_TIMEOUT => 10,
+				CURLOPT_POSTFIELDS => json_encode($unmatched),
+				CURLOPT_HTTPHEADER => array(
+						'accept: application/json',
+						'Content-Type: application/json')
+		);
+		curl_setopt_array($curl, $options);
+		$contents = curl_exec($curl);
+		if ($contents === false) $results['errors'][] = "ClearlyDefined: " . curl_error($curl);
+		curl_close($curl);
+	} else {
+		$results['errors'][] = "Could not connect to ClearlyDefined.";
+	}
+	if ($contents === false) return; // the error has already been recorded
+	$json = json_decode($contents, true);
+	if (!is_array($json)) {
+		$results['errors'][] = "ClearlyDefined: unexpected response.";
+		return;
+	}
+	foreach($json as $id => $record) {
+		// TODO Investigate why the record sometimes has the _id and sometimes does not.
+		$metadata = new ClearlyDefinedMetadata($id, $record);
+		$license = $metadata->getLicense();
+		$status = ($license !== null && $licenses->isApproved($license)) ? 'approved' : 'restricted';
+		unset($results['unmatched'][$metadata->getId()]);
+		$results[$status][$metadata->getId()] = array(
+				'id' => $metadata->getId(),
+				'license' => $license,
+				'status' => $status,
+				'sourceUrl' => $metadata->getSourceUrl(),
+				'definitionUrl' => $metadata->getDefinitionUrl(),
+				'authority' => 'clearlydefined',
+				'confidence' => $metadata->getScore()
+		);
+	}
+}
+
+// The body is the JSON encoding of the results, so declare it as JSON. The disposition
+// value is "inline" plus a filename parameter; it must not be wrapped in quotes as a whole.
+header ("Content-Type: application/json");
+header ("Content-Disposition: inline; filename=\"licenses.json\"");
+
+// Without posted content there is nothing to scan: only the headers are sent.
+if (!isset($_POST['content'])) return;
+
+require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
+$App = new App ();
+
+require_once dirname(__FILE__) . "/../classes/database.inc";
+
+$content = $_POST['content'];
+// A non-string value (e.g. posted as content[]) or an empty submission yields empty results.
+if (is_string($content) && $content !== '') {
+	$results = loadFromString($content);
+	matchAgainstFoundationData($results);
+	matchAgainstClearlyDefined($results);
+} else {
+	$results = array();
+}
+echo json_encode($results);
diff --git a/tools/license.php b/tools/license.php
new file mode 100644
index 0000000..c244631
--- /dev/null
+++ b/tools/license.php
@@ -0,0 +1,46 @@
+<?php
+/**
+ * Copyright (c) Eclipse Foundation and others.
+ *
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ */
+
+require_once($_SERVER['DOCUMENT_ROOT'] . "/eclipse.org-common/system/app.class.php");
+require_once($_SERVER['DOCUMENT_ROOT'] . "/eclipse.org-common/system/nav.class.php");
+require_once($_SERVER['DOCUMENT_ROOT'] . "/eclipse.org-common/system/menu.class.php");
+$App 	= new App();
+$Nav	= new Nav();
+$Menu 	= new Menu();
+include($App->getProjectCommon());
+// Project-local helpers; presumably where mustBeFoundationEmployee() is defined -- TODO confirm.
+require_once (dirname ( __FILE__ ) . '/../classes/debug.php');
+// NOTE(review): by its name this restricts the page to Foundation staff; it is defined outside this file -- confirm.
+mustBeFoundationEmployee();
+// Page metadata that is handed to $App->generatePage() at the end of the script.
+$pageTitle 		= "License Checker";
+$pageKeywords	= "";
+$pageAuthor		= "Wayne Beaton";
+// Buffer the markup below so that it can be passed to generatePage() as a string.
+ob_start();
+?>
+<div id="maincontent">
+<div id="midcolumn">
+<h1><?= $pageTitle ?></h1>
+
+<form method="POST" action="../services/licenses.php">
+<textarea name="content" rows="25" cols="80"></textarea><br/><br/>
+<input type="submit" value="Submit"/>
+</form>
+
+</div>
+</div>
+
+<?php
+// Take the buffered markup, discard the buffer, and let the common framework render the page around it.
+$html = ob_get_clean();
+$App->generatePage($theme, $Menu, $Nav, $pageAuthor, $pageKeywords, $pageTitle, $html);
+?>