Experimental/prototype license checker tool.
Change-Id: Id309cfde64f556b24c511331756a568a4f4af7c2
diff --git a/services/licenses.php b/services/licenses.php
new file mode 100755
index 0000000..7dda28f
--- /dev/null
+++ b/services/licenses.php
@@ -0,0 +1,291 @@
+<?php
+/*******************************************************************************
+ * Copyright (c) 2019 Eclipse Foundation and others.
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+/*
+ * This file is a prototype for executing a license scan of a bill of
+ * materials. The bill of materials is provided as plain text with each
+ * line containing one unit of content (i.e., a library). Blank lines
+ * and comment lines are skipped (comment lines start with "#" or "//").
+ *
+ * The id for a unit of content may be expressed as Maven coordinates of the form
+ * "groupid:artifactid[:packaging]:version", as abridged Purl coordinates of the form
+ * "type/namespace/name@version", or as ClearlyDefined coordinates of the
+ * form "type/source/namespace/name/version". Mixing formats is supported.
+ *
+ * Output is expressed as ClearlyDefined ids.
+ *
+ * Two sources of information are used to map ids to license information and other
+ * metadata: first, the Eclipse Foundation data is consulted and then
+ * the ClearlyDefined services are called. Future versions of this script
+ * may consult other sources of data.
+ *
+ * usage example:
+ *
+ * curl -X POST http://localhost/projects/services/licenses.php \
+ * -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
+ *
+ * Note that this works with an instance running on localhost.
+ *
+ * TODO Validate that we can scale to ~4K lines
+ * TODO support file upload.
+ */
+
+
+/** Map a Maven ("group:artifact[:packaging]:version") or ClearlyDefined id to a ClearlyDefined
+ * id, or null when neither form is found. Maven version qualifiers ("-jre") are preserved. */
+function normalizeId($id) {
+    if (preg_match('/([\w@\-.]+)(?:\/[\w@\-.]+){4}/', $id, $matches)) return $matches[0];
+    if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+)?:(\d(?:[\w\-.+]*\w)?)/', $id, $matches))
+        return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
+    return null;
+}
+
+/**
+ * Parse a bill of materials (one id per line): ids go to $results['unmatched'] keyed by their
+ * normalized form, unrecognized lines to $results['invalid']; blanks and "#"/"//" comments skip.
+ */
+function loadFromString($content) {
+    $stream = fopen('php://memory','w+');
+    fwrite($stream, $content);
+    rewind($stream);
+    $results = array();
+    while (($line = fgets($stream)) !== false) {
+        $entry = trim($line);
+        // "continue", not "break": a blank or comment line must not end the scan.
+        if ($entry === '' || preg_match('/^(?:#|\/\/)/', $entry)) continue;
+        if ($id = normalizeId($entry)) {
+            $results['unmatched'][$id] = array();
+        } else {
+            $results['invalid'][] = $line;
+        }
+    }
+    fclose($stream);
+    return $results;
+}
+
+/*
+ * Match against the consolidated data from Eclipse Foundation
+ * sources. The consolidated data is stored in the dashboard
+ * database by a script that runs periodically.
+ *
+ * This function modifies the parameter. As we find matches, the id
+ * is removed from the "unmatched" set and the metadata that we do
+ * find is added to an array by status. An absent or empty set is a no-op.
+ *
+ * @see project-services/capture/php/import_third_party_license_data.php
+ */
+function matchAgainstFoundationData(&$results) {
+    if (empty($results['unmatched'])) return;
+    $sql = 'select
+ id, license, status, sourceUrl, definitionUrl, authority, confidence
+ from ThirdPartyLicense
+ where id=":id"';
+    foreach (array_keys($results['unmatched']) as $id) {
+        query('dashboard', $sql, array(':id' => $id), function($row) use (&$results) {
+            unset($results['unmatched'][$row['id']]);
+            $results[$row['status']][$row['id']] = $row;
+        });
+    }
+}
+
+/**
+ * Instances of this class represent a single entry in the results
+ * returned by a ClearlyDefined query. Every instance wraps an
+ * array of returned values; this class exists primarily to
+ * encapsulate the means by which the data is queried.
+ */
+class ClearlyDefinedMetadata {
+    // $providerUrl is declared (and left null) so getProviderUrl() never reads an undefined property.
+    var $id, $data, $providerUrl;
+
+    public function __construct($id, $data) {
+        $this->id = $id;
+        $this->data = $data;
+    }
+
+    public function getId() {
+        return $this->id;
+    }
+
+    public function getLicense() {
+        // TODO Re-evaluate
+        // For now, we assume the license to be the declared
+        // license. It is possible that some of the content may
+        // be under other licenses. A future implementation may opt
+        // to somehow combine the declared license with those
+        // licenses reported by individual files.
+        return isset($this->data['licensed']['declared']) ? $this->data['licensed']['declared'] : null;
+    }
+
+    public function getSourceUrl() {
+        return isset($this->data['described']['sourceLocation']['url']) ? $this->data['described']['sourceLocation']['url'] : null;
+    }
+
+    public function getDefinitionUrl() {
+        // TODO Implement me
+        return null;
+    }
+
+    /**
+     * The score is a numeric value in the range 0-100 that
+     * indicates the confidence in the license information
+     * (-1 when the data carries no effective score).
+     * @return number
+     */
+    public function getScore() {
+        return $this->getEffectiveScore();
+    }
+
+    /** The 'scores'/'effective' value cast to int, or -1 when the data has none. */
+    public function getEffectiveScore() {
+        return isset($this->data['scores']['effective']) ? (int)$this->data['scores']['effective'] : -1;
+    }
+
+    /** Nothing in this class assigns the provider URL, so it is null unless a caller sets it. */
+    public function getProviderUrl() {
+        return $this->providerUrl;
+    }
+}
+
+/**
+ * Manages the Eclipse Foundation's license white list and provides
+ * behaviour for making licensing decisions in consideration of that
+ * white list. Note that load() builds a new instance on every call.
+ */
+class Licenses {
+    var $whitelist;
+
+    public static function load() {
+        // TODO Make configurable (and correct)
+        $whitelist = json_decode(file_get_contents('http://localhost/legal/licenses.json'), true);
+
+        // Augment the official list with licenses that are acceptable, but
+        // not explicitly included in our approved list.
+        $whitelist['approved']['EPL-1.0'] = "Eclipse Public License, v1.0";
+        $whitelist['approved']['EPL-2.0'] = "Eclipse Public License, v2.0";
+        $whitelist['approved']['WTFPL'] = "WTFPL";
+        $whitelist['approved']['CC-BY-3.0'] = "CC-BY-3.0";
+        $whitelist['approved']['CC-BY-4.0'] = "CC-BY-4.0";
+        $whitelist['approved']['Unlicense'] = "Unlicense";
+        $whitelist['approved']['Artistic-2.0'] = "Artistic-2.0";
+
+        return new self($whitelist);
+    }
+
+    public function __construct($whitelist) {
+        $this->whitelist = $whitelist;
+    }
+
+    /**
+     * Determine whether or not an SPDX expression is approved per
+     * our licenses white list. That is, do we consider content that is licensed
+     * per an SPDX expression something that can be leveraged by an Eclipse
+     * project.
+     *
+     * @param string $spdxExpression a non-string or blank value is never approved
+     * @return boolean
+     */
+    public function isApproved($spdxExpression) {
+        // TODO We probably need more sophisticated expression parsing.
+        // This is a quick and dirty "for now" solution. I'm reasoning that
+        // an AND condition is weird and needs attention. Over time, we'll
+        // probably get a better sense of what we can automate. We may also
+        // consider changing this function to answer something other than
+        // a boolean. Perhaps, there is some notion of red/yellow/green
+        // that might be valuable.
+        if (!is_string($spdxExpression) || trim($spdxExpression) === '') return false;
+        // Match AND only as a whole word; ids such as "StandardML-NJ" merely contain "and".
+        if (preg_match('/\bAND\b/i', $spdxExpression)) return false;
+        foreach(preg_split('/\s+OR\s+/i',$spdxExpression) as $code) {
+            // Grouping parentheses do not change an OR-only expression; ignore them.
+            if (isset($this->whitelist['approved'][trim($code, " \t\n\r()")])) return true;
+        }
+        return false;
+    }
+}
+
+/**
+ * Ask the ClearlyDefined definitions service about every id still in 'unmatched'. Each record
+ * returned moves its id to 'approved' or 'restricted' per the license white list; a transport
+ * or decoding failure is reported in 'errors' and leaves 'unmatched' untouched.
+ */
+function matchAgainstClearlyDefined(&$results) {
+    // Nothing to look up: skip the white list download and the remote call entirely.
+    if (empty($results['unmatched'])) return;
+    $unmatched = array_keys($results['unmatched']);
+
+    // Stays false when curl_init() fails, so that path also bails out below.
+    $contents = false;
+    if ($curl = curl_init('https://api.clearlydefined.io/definitions')) {
+        $options = array(
+            CURLOPT_USERAGENT => "Eclipse",
+            CURLOPT_RETURNTRANSFER => true,
+            CURLOPT_FOLLOWLOCATION => true,
+            CURLOPT_POST => TRUE,
+            CURLOPT_CONNECTTIMEOUT => 5,
+            CURLOPT_TIMEOUT => 10,
+            CURLOPT_POSTFIELDS => json_encode($unmatched),
+            CURLOPT_HTTPHEADER => array('accept: application/json', 'Content-Type: application/json')
+        );
+        curl_setopt_array($curl, $options);
+        $contents = curl_exec($curl);
+        if ($contents === false) {
+            $results['errors'][] = "ClearlyDefined: " . curl_error($curl);
+        }
+        curl_close($curl);
+    } else {
+        $results['errors'][] = "Could not connect to ClearlyDefined.";
+    }
+    if ($contents === false) return;
+
+    $json = json_decode($contents, true);
+    if (!is_array($json)) { $results['errors'][] = "ClearlyDefined: unexpected response."; return; }
+
+    $licenses = Licenses::load();
+    foreach($json as $id => $record) {
+        // TODO Investigate why the record sometimes has the _id and sometimes does not.
+        $metadata = new ClearlyDefinedMetadata($id, $record);
+        $status = $licenses->isApproved($metadata->getLicense()) ? 'approved' : 'restricted';
+        unset($results['unmatched'][$metadata->getId()]);
+        $results[$status][$metadata->getId()] = array(
+            'id' => $metadata->getId(),
+            'license' => $metadata->getLicense(),
+            'status' => $status,
+            'sourceUrl' => $metadata->getSourceUrl(),
+            'definitionUrl' => $metadata->getDefinitionUrl(),
+            'authority' => 'clearlydefined',
+            'confidence' => $metadata->getScore()
+        );
+    }
+}
+
+// The body is JSON (see json_encode() below), so the headers must say so; the disposition
+// value itself is unquoted and only the filename parameter is a quoted string.
+header ("Content-Type: application/json");
+header ("Content-Disposition: inline; filename=\"licenses.json\"");
+
+if (!isset($_POST['content'])) return;
+
+require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
+$App = new App ();
+
+require_once dirname(__FILE__) . "/../classes/database.inc";
+
+// isset() was checked above, so no error suppression is needed here.
+$content = $_POST['content'];
+if ($content) {
+    $results = loadFromString($content);
+    matchAgainstFoundationData($results);
+    matchAgainstClearlyDefined($results);
+} else {
+    $results = array();
+}
+
+echo json_encode($results);
diff --git a/tools/license.php b/tools/license.php
new file mode 100644
index 0000000..c244631
--- /dev/null
+++ b/tools/license.php
@@ -0,0 +1,46 @@
+<?php
+/**
+ * Copyright (c) Eclipse Foundation and others.
+ *
+ * This program and the accompanying materials are made
+ * available under the terms of the Eclipse Public License 2.0
+ * which is available at https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ */
+
+require_once($_SERVER['DOCUMENT_ROOT'] . "/eclipse.org-common/system/app.class.php");
+require_once($_SERVER['DOCUMENT_ROOT'] . "/eclipse.org-common/system/nav.class.php");
+require_once($_SERVER['DOCUMENT_ROOT'] . "/eclipse.org-common/system/menu.class.php");
+$App = new App();
+$Nav = new Nav();
+$Menu = new Menu();
+include($App->getProjectCommon());
+
+require_once (dirname ( __FILE__ ) . '/../classes/debug.php');
+// NOTE(review): mustBeFoundationEmployee() is not declared in this file; presumably debug.php provides it — confirm.
+mustBeFoundationEmployee();
+
+$pageTitle = "License Checker";
+$pageKeywords = "";
+$pageAuthor = "Wayne Beaton";
+// Capture the markup below in an output buffer so it can be passed to generatePage() as $html.
+ob_start();
+?>
+<div id="maincontent">
+<div id="midcolumn">
+<h1><?= $pageTitle ?></h1>
+
+<form method="POST" action="../services/licenses.php">
+<textarea name="content" rows="25" cols="80"></textarea><br/><br/>
+<input type="submit" value="Submit"/>
+</form>
+
+</div>
+</div>
+
+<?php
+$html = ob_get_contents();
+ob_end_clean();
+$App->generatePage($theme, $Menu, $Nav, $pageAuthor, $pageKeywords, $pageTitle, $html); // NOTE(review): $theme is not set in this file; presumably the project-common include defines it — confirm.
+?>