Add error handling.
Change-Id: Icf244900c17b4d47b19dfb3208dd7e7b5c61bf0e
diff --git a/services/license_check.php b/services/license_check.php
new file mode 100755
index 0000000..df6a36b
--- /dev/null
+++ b/services/license_check.php
@@ -0,0 +1,246 @@
+<?php
+/*******************************************************************************
+ * Copyright (c) 2019 Eclipse Foundation and others.
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+/*
+ * This file is a prototype for executing a license scan of a bill of
+ * materials. The bill of materials is provided as plain text with each
+ * line containing one unit of content (i.e., a library). Blank lines
+ * and comment lines are skipped (comment lines start with "#" or "//").
+ *
+ * The id for a unit of content may be expressed as Maven coordinates of the form
+ * "groupid:artifactid[:packaging]:version", as abridged Purl coordinates of the form
+ * "type/namespace/name@version", or as ClearlyDefined coordinates of the
+ * form "type/source/namespace/name/version". Mixing formats is supported.
+ *
+ * Output is expressed as ClearlyDefined ids.
+ *
+ * Two sources of information are used to map ids to license information and other
+ * metadata: first, the Eclipse Foundation data is consulted and then
+ * the ClearlyDefined services are called. Future versions of this script
+ * may consult other sources of data.
+ *
+ * usage example:
+ *
+ * curl -X POST http://localhost/projects/services/licenses.php \
+ * -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
+ *
+ * curl -X POST "http://localhost/projects/services/licenses.php" \
+ * -d "content=`mvn dependency:list -DskipTests -Dmaven.javadoc.skip=true | grep -Poh '\S+(?=:compile)' | sort | uniq`" | jsonpp | less
+ *
+ * yarn list | grep -Poh "(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)" \
+ * | curl -X POST "http://localhost/projects/services/licenses.php?XDEBUG_SESSION_START=ECLIPSE_DBGP" \
+ * --data-urlencode content@- | jsonpp | less
+ *
+ * Note that this works with an instance running on localhost.
+ *
+ * TODO Validate that we can scale to ~4K lines
+ * TODO support file upload.
+ * TODO Customize license list based on project (e.g., science.* can use LGPL)
+ * TODO Parameters for default type and provider.
+ */
+
+/**
+ * Convert a content identifier into ClearlyDefined coordinates of the form
+ * <em>type/provider/namespace/name/version</em>; for example, Maven coordinates
+ * <em>groupid:artifactid:version</em> become <em>maven/mavencentral/groupid/artifactid/version</em>.
+ *
+ * The patterns are not anchored, so text around a recognized id is dropped.
+ * Answers <code>null</code> when the id is not recognized.
+ *
+ * @param string $id
+ * @return NULL|string
+ */
+function normalizeId($id) {
+	$found = array();
+
+	// Five slash-separated segments are taken to be ClearlyDefined coordinates already.
+	if (preg_match('/([\w@\-.]+)(?:\/[\w@\-.]+){4}/', $id, $found) === 1) {
+		return $found[0];
+	}
+
+	/*
+	 * Maven coordinates, "groupid:artifactid[:packaging]:version"; only the leading
+	 * dotted-number part of the version is kept. Because of Tycho using p2
+	 * repositories, a group id of "p2.eclipse-plugin" or "p2.eclipse-feature"
+	 * produces an id for p2/orbit instead of Maven Central.
+	 */
+	if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+)?:(\d+(?:\.\d+)*)/', $id, $found) === 1) {
+		list(, $group, $artifact, $version) = $found;
+		$isP2 = preg_match('/^p2.eclipse-(?:plugin|feature)$/', $group) === 1;
+		return ($isP2 ? 'p2/orbit' : 'maven/mavencentral') . "/{$group}/{$artifact}/{$version}";
+	}
+
+	/*
+	 * Some p2 ids name a jar inside the bundle, which is enough to match against
+	 * Maven directly, e.g.
+	 * "p2.eclipse-plugin:org.apache.ant:jar:lib/ant-jsch.jar:1.10.5.v20190526-1402"
+	 * maps to "maven/mavencentral/org.apache.ant/ant-jsch/1.10.5"
+	 */
+	if (preg_match('/p2.eclipse-plugin:([\w@\-.]+)(?::[\w@\-.]+)?:lib\/([\w@\-.]+).jar:(\d+(?:\.\d+)*)/', $id, $found) === 1) {
+		return implode('/', array('maven', 'mavencentral', $found[1], $found[2], $found[3]));
+	}
+
+	/*
+	 * pURL-style coordinates, "[namespace/]name@version", e.g.
+	 * "@webassemblyjs/wast-printer@1.7.8". Non-digit characters between the "@" and
+	 * the version (e.g., range qualifiers from <code>yarn list</code>) are ignored.
+	 *
+	 * FIXME Sort out what to do with the range qualifiers
+	 * FIXME Don't assume that values provided in pURL format are NPM.
+	 */
+	if (preg_match('/(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)/', $id, $found) === 1) {
+		// The namespace is optional; when absent, a dash ("-") stands in for it.
+		$namespace = empty($found[1]) ? '-' : $found[1];
+		return "npm/npmjs/{$namespace}/{$found[2]}/{$found[3]}";
+	}
+
+	return null;
+}
+
+/**
+ * Parse a plain-text bill of materials with one content id per line. Blank lines
+ * and comment lines (starting with "#" or "//") are skipped instead of ending the scan.
+ *
+ * @param string $content
+ * @return array recognized ids as keys of 'unmatched', rejected lines under 'invalid'
+ */
+function loadFromString($content) {
+	$results = array();
+	// \R matches any newline sequence, so Unix and Windows line endings both work.
+	foreach (preg_split('/\R/', $content) as $line) {
+		$line = trim($line);
+		if ($line === '' || preg_match('/^(?:#|\/\/)/', $line)) continue;
+
+		if ($id = normalizeId($line)) {
+			$results['unmatched'][$id] = array();
+		} else {
+			$results['invalid'][] = $line;
+		}
+	}
+	return $results;
+}
+
+// Decode a package-lock document and let visitPackages() report each package id;
+// every reported id becomes a key of 'unmatched' mapped to an empty array.
+function loadFromPackageLockString($content) {
+	$found = array();
+	$collect = function($id) use (&$found) { $found['unmatched'][$id] = array(); };
+	visitPackages(json_decode($content, true), $collect);
+	return $found;
+}
+
+/**
+ * Pass the ClearlyDefined id ("npm/npmjs/namespace/name/version") of every entry in
+ * the nested "dependencies" maps of a decoded package-lock document to $callback.
+ * Entries that are not arrays, or that carry no version, are skipped along with
+ * anything nested under them, rather than producing a malformed id.
+ */
+function visitPackages($root, Callable $callback) {
+	if (!is_array($root) || !isset($root['dependencies']) || !is_array($root['dependencies'])) return;
+
+	foreach($root['dependencies'] as $name => $data) {
+		if (!is_array($data) || !isset($data['version']) || !is_scalar($data['version'])) continue;
+		// "@scope/name" splits into namespace and name; unscoped packages get "-".
+		list($namespace, $package) = preg_match('/^(.+)\/(.+)$/', $name, $matches) ? array($matches[1], $matches[2]) : array('-', $name);
+		call_user_func($callback, "npm/npmjs/{$namespace}/{$package}/{$data['version']}");
+		visitPackages($data, $callback);
+	}
+}
+
+/**
+ * Move ids that belong to Eclipse project namespaces from 'unmatched' to 'approved'.
+ * An id qualifies when its namespace (third segment) starts with org.eclipse,
+ * org.polarsys or org.locationtech, or when the namespace is "p2.eclipse-plugin" or
+ * "p2.eclipse-feature" and its name (fourth segment) starts with one of those.
+ * A result set without 'unmatched', and ids with too few segments, are tolerated.
+ *
+ * @param array $results modified in place
+ */
+function matchAgainstEclipseProjects(&$results) {
+	if (empty($results['unmatched']) || !is_array($results['unmatched'])) return;
+
+	$ours = '/^org\.(?:eclipse|polarsys|locationtech)/';
+	foreach(array_keys($results['unmatched']) as $id) {
+		$parts = explode('/', $id);
+		$namespace = isset($parts[2]) ? $parts[2] : '';
+		$name = isset($parts[3]) ? $parts[3] : '';
+		$isP2 = preg_match('/^p2\.eclipse-(?:plugin|feature)$/', $namespace);
+		if (!preg_match($ours, $isP2 ? $name : $namespace)) continue;
+		unset($results['unmatched'][$id]);
+		$results['approved'][$id] = array(
+			'id' => $id,
+			'license' => '',
+			'status' => 'approved',
+			'sourceUrl' => '',
+			'definitionUrl' => '',
+			'authority' => 'eclipse',
+			'confidence' => 90
+		);
+	}
+}
+
+/*
+ * Match against the consolidated data from Eclipse Foundation sources, which a
+ * periodic script stores in the dashboard database.
+ *
+ * This function modifies the parameter: each row found removes its id from the
+ * "unmatched" set and is added to an array keyed by the row's status. A result
+ * set with no "unmatched" entries is left untouched.
+ *
+ * @see project-services/capture/php/import_third_party_license_data.php
+ */
+function matchAgainstFoundationData(&$results) {
+	if (empty($results['unmatched']) || !is_array($results['unmatched'])) return;
+	// NOTE(review): quoted placeholder -- presumably query() substitutes an escaped value; confirm in database.inc.
+	$sql = 'select
+		id, license, status, sourceUrl, definitionUrl, authority, confidence
+		from ThirdPartyLicense
+		where id=":id"';
+	foreach(array_keys($results['unmatched']) as $id) {
+		query('dashboard', $sql, array(':id' => $id), function($row) use (&$results) {
+			unset($results['unmatched'][$row['id']]);
+			$results[$row['status']][$row['id']] = $row;
+		});
+	}
+}
+
+
+// Everything above this line could (and probably should) be factored out.
+
+// The body is JSON, so declare it as such (it was labelled text/csv), and quote
+// only the filename in Content-Disposition rather than the whole header value.
+header ("Content-type: application/json");
+header ("Content-Disposition: inline; filename=\"license_check.json\"");
+
+require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
+$App = new App ();
+
+require_once dirname(__FILE__) . "/../classes/database.inc";
+
+// Start empty so that a request without any recognized parameter answers "[]"
+// instead of reading an undefined variable. When several parameters are sent,
+// the last one handled wins.
+$results = array();
+if (!empty($_POST['content'])) $results = loadFromString($_POST['content']);
+if (!empty($_POST['package-lock'])) $results = loadFromPackageLockString($_POST['package-lock']);
+if (!empty($_POST['json'])) {
+	// Expect a JSON list of ids; anything that is not a list of strings is ignored.
+	$ids = json_decode($_POST['json'], true);
+	$results = array('unmatched' => array_fill_keys(array_filter(is_array($ids) ? $ids : array(), 'is_string'), array()));
+}
+
+if ($results) {
+	matchAgainstEclipseProjects($results);
+	matchAgainstFoundationData($results);
+}
+
+echo json_encode($results);
+
+?>
diff --git a/services/licenses.php b/services/licenses.php
index 27726bc..c81e398 100755
--- a/services/licenses.php
+++ b/services/licenses.php
@@ -31,10 +31,19 @@
* curl -X POST http://localhost/projects/services/licenses.php \
* -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
*
+ * curl -X POST "http://localhost/projects/services/licenses.php" \
+ * -d "content=`mvn dependency:list -DskipTests -Dmaven.javadoc.skip=true | grep -Poh '\S+(?=:compile)' | sort | uniq`" | jsonpp | less
+ *
+ * yarn list | grep -Poh "(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)" \
+ * | curl -X POST "http://localhost/projects/services/licenses.php?XDEBUG_SESSION_START=ECLIPSE_DBGP" \
+ * --data-urlencode content@- | jsonpp | less
+ *
* Note that this works with an instance running on localhost.
*
* TODO Validate that we can scale to ~4K lines
* TODO support file upload.
+ * TODO Customize license list based on project (e.g., science.* can use LGPL)
+ * TODO Parameters for default type and provider.
*/
/**
@@ -58,7 +67,7 @@
* because of Tycho using p2 repositories. When the coordinates start with
* "p2.eclipse-plugin" or "p2.eclipse-feature", we generate an id for p2/orbit.
*/
- if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+)?:(\d+(?:\.\d+)*)/', $id, $matches)) {
+ if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+){0,2}:(\d+(?:\.\d+)*)/', $id, $matches)) {
if (preg_match('/^p2.eclipse-(?:plugin|feature)$/', $matches[1])) {
return "p2/orbit/{$matches[1]}/{$matches[2]}/{$matches[3]}";
}
@@ -74,7 +83,24 @@
if (preg_match('/p2.eclipse-plugin:([\w@\-.]+)(?::[\w@\-.]+)?:lib\/([\w@\-.]+).jar:(\d+(?:\.\d+)*)/', $id, $matches)) {
return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
}
- // TODO add Purl coordinates
+
+ /*
+ * Deal with pURL coordinates. These take the form "namespace/name@version",
+ * e.g., "@webassemblyjs/wast-printer@1.7.8". The namespace is optional; when
+ * absent, we use a dash ("-").
+ *
+ * I was surprised to see range qualifiers with some of the versions provided
+ * from yarn (<code>yarn list</code>). For now, we just ignore them.
+ *
+ * FIXME Sort out what to do with the range qualifiers
+ *
+ * FIXME Don't assume that values provided in pURL format are NPM.
+ */
+ if (preg_match('/(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)/', $id, $matches)) {
+ $namespace = empty($matches[1]) ? '-' : $matches[1];
+ return "npm/npmjs/{$namespace}/{$matches[2]}/{$matches[3]}";
+ }
+
return null;
}
@@ -85,15 +111,15 @@
$results = array();
- while ($line = fgets($stream)) {
- if (empty(trim($line))) break;
+ while ($line = trim(fgets($stream))) {
+ if (empty($line)) break;
if (preg_match('/$#', $line)) break;
if (preg_match('/$\/\//', $line)) break;
if ($id = normalizeId(trim($line))) {
$results['unmatched'][$id] = array();
} else {
- $results['invalid'][] = $line;
+ $results['invalid'][$line] = array();
}
}
@@ -101,10 +127,37 @@
return $results;
}
+// Decode a package-lock document and let visitPackages() report each package id;
+// every reported id becomes a key of 'unmatched' mapped to an empty array.
+function loadFromPackageLockString($content) {
+	$found = array();
+	$collect = function($id) use (&$found) { $found['unmatched'][$id] = array(); };
+	visitPackages(json_decode($content, true), $collect);
+	return $found;
+}
+
+/**
+ * Pass the ClearlyDefined id ("npm/npmjs/namespace/name/version") of every entry in
+ * the nested "dependencies" maps of a decoded package-lock document to $callback.
+ * Entries that are not arrays, or that carry no version, are skipped along with
+ * anything nested under them, rather than producing a malformed id.
+ */
+function visitPackages($root, Callable $callback) {
+	if (!is_array($root) || !isset($root['dependencies']) || !is_array($root['dependencies'])) return;
+
+	foreach($root['dependencies'] as $name => $data) {
+		if (!is_array($data) || !isset($data['version']) || !is_scalar($data['version'])) continue;
+		// "@scope/name" splits into namespace and name; unscoped packages get "-".
+		list($namespace, $package) = preg_match('/^(.+)\/(.+)$/', $name, $matches) ? array($matches[1], $matches[2]) : array('-', $name);
+		call_user_func($callback, "npm/npmjs/{$namespace}/{$package}/{$data['version']}");
+		visitPackages($data, $callback);
+	}
+}
+
function matchAgainstEclipseProjects(&$results) {
foreach(array_keys($results['unmatched']) as $id) {
if ($parts = preg_split('/\//', $id)) {
- if (preg_match('/^org\.eclipse/', $parts[2])) {
+ if (preg_match('/^org\.(?:eclipse|polarsys|locationtech)/', $parts[2])) {
unset($results['unmatched'][$id]);
$results['approved'][$id] = array(
'id' => $id,
@@ -116,7 +169,7 @@
'confidence' => 90
);
} elseif (preg_match('/^p2.eclipse-(?:plugin|feature)$/', $parts[2])) {
- if (preg_match('/^org\.eclipse/', $parts[3])) {
+ if (preg_match('/^org\.(?:eclipse|polarsys|locationtech)/', $parts[3])) {
unset($results['unmatched'][$id]);
$results['approved'][$id] = array(
'id' => $id,
@@ -284,68 +337,74 @@
function matchAgainstClearlyDefined(&$results) {
$licenses = Licenses::load();
- $unmatched = array();
- foreach($results['unmatched'] as $id => $ignore) {
- /*
- * ClearlyDefined has a problem with resource types that it
- * doesn't know about. Let's prune out the p2 ones.
- */
- if (preg_match('/^p2\//', $id)) continue;
- $unmatched[] = $id;
- }
-
- if ($curl = curl_init('https://api.clearlydefined.io/definitions')) {
- $options = array(
- CURLOPT_USERAGENT => "Eclipse Foundation",
- CURLOPT_RETURNTRANSFER => true,
- CURLOPT_FOLLOWLOCATION => true,
- CURLOPT_RETURNTRANSFER => 1,
- CURLOPT_POST => TRUE,
- CURLOPT_CONNECTTIMEOUT => 5,
- CURLOPT_TIMEOUT => 10,
- CURLOPT_POSTFIELDS => json_encode($unmatched),
- CURLOPT_HTTPHEADER => array(
- 'accept: application/json',
- 'Content-Type: application/json'
- )
- );
- curl_setopt_array($curl, $options);
- $contents = curl_exec($curl);
-
- if ($contents === false) {
- $results['errors'][] = "ClearlyDefined: " . curl_error($curl);
+ reset($results['unmatched']);
+ while (current($results['unmatched']) !== FALSE) {
+ $unmatched = array();
+ while (current($results['unmatched']) !== FALSE) {
+ $id = key($results['unmatched']);
+ next($results['unmatched']);
+ /*
+ * ClearlyDefined has a problem with resource types that it
+ * doesn't know about. Let's prune out the p2 ones.
+ */
+ if (preg_match('/^p2\//', $id)) continue;
+ $unmatched[] = $id;
+ if (count($unmatched) >= 1000) break;
}
- curl_close($curl);
- } else {
- $results['errors'][] = "ClearlyDefined: Could not initialize the session.";
- }
+ if ($curl = curl_init('https://api.clearlydefined.io/definitions')) {
+ $options = array(
+ CURLOPT_USERAGENT => "Eclipse Foundation",
+ CURLOPT_RETURNTRANSFER => true,
+ CURLOPT_FOLLOWLOCATION => true,
+ CURLOPT_RETURNTRANSFER => 1,
+ CURLOPT_POST => TRUE,
+ //CURLOPT_CONNECTTIMEOUT => 5,
+ //CURLOPT_TIMEOUT => 10,
+ CURLOPT_POSTFIELDS => json_encode($unmatched),
+ CURLOPT_HTTPHEADER => array(
+ 'accept: application/json',
+ 'Content-Type: application/json'
+ )
+ );
+ curl_setopt_array($curl, $options);
+ $contents = curl_exec($curl);
- // If we get a cUrl error, just bail out.
- if ($contents === false) return;
+ if ($contents === false) {
+ $results['errors'][] = "ClearlyDefined: " . curl_error($curl);
+ }
- $json = json_decode($contents, true);
+ curl_close($curl);
+ } else {
+ $results['errors'][] = "ClearlyDefined: Could not initialize the session.";
+ }
- if (isset($json['error'])) {
- $results['errors'][] = "ClearlyDefined: {$json['error']['message']}";
- return;
- }
+ // If we get a cUrl error, just bail out.
+ if ($contents === false) return;
- foreach($json as $id => $record) {
- // TODO Investigate why the record sometimes has the _id and sometimes does not.
- $metadata = new ClearlyDefinedMetadata($id, $record);
+ $json = json_decode($contents, true);
- $status = $licenses->isApproved($metadata->getLicense()) ? 'approved' : 'restricted';
- unset($results['unmatched'][$metadata->getId()]);
- $results[$status][$metadata->getId()] = array(
- 'id' => $metadata->getId(),
- 'license' => $metadata->getLicense(),
- 'status' => $status,
- 'sourceUrl' => $metadata->getSourceUrl(),
- 'definitionUrl' => $metadata->getDefinitionUrl(),
- 'authority' => 'clearlydefined',
- 'confidence' => $metadata->getScore()
- );
+ if (isset($json['error'])) {
+ $results['errors'][] = "ClearlyDefined: {$json['error']['message']}";
+ return;
+ }
+
+ foreach($json as $id => $record) {
+ // TODO Investigate why the record sometimes has the _id and sometimes does not.
+ $metadata = new ClearlyDefinedMetadata($id, $record);
+
+ $status = $licenses->isApproved($metadata->getLicense()) ? 'approved' : 'restricted';
+ unset($results['unmatched'][$metadata->getId()]);
+ $results[$status][$metadata->getId()] = array(
+ 'id' => $metadata->getId(),
+ 'license' => $metadata->getLicense(),
+ 'status' => $status,
+ 'sourceUrl' => $metadata->getSourceUrl(),
+ 'definitionUrl' => $metadata->getDefinitionUrl(),
+ 'authority' => 'clearlydefined',
+ 'confidence' => $metadata->getScore()
+ );
+ }
}
}
@@ -354,17 +413,20 @@
header ("Content-type: text/csv");
header ("Content-Disposition: \"inline; filename=licenses.csv\"");
-if (!isset($_POST['content'])) return;
-
require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
$App = new App ();
require_once dirname(__FILE__) . "/../classes/database.inc";
-$content = @$_POST['content'];
-
-if ($content) {
+if ($content = @$_POST['content']) {
$results = loadFromString($content);
+}
+
+if ($content = @$_POST['package-lock']) {
+ $results = loadFromPackageLockString($content);
+}
+
+if ($results) {
matchAgainstEclipseProjects($results);
matchAgainstFoundationData($results);
matchAgainstClearlyDefined($results);
@@ -372,6 +434,21 @@
$results = array();
}
-echo json_encode($results);
+// echo json_encode($results);
+
+// 'unmatched' and 'invalid' entries hold empty arrays, so missing metadata columns default to ''.
+$fp = fopen('php://output', 'w');
+foreach($results as $key => $values) {
+	if ($key === 'errors') {
+		foreach ($values as $error) echo "# {$error}\n";
+		continue;
+	}
+	foreach ($values as $id => $data) {
+		$row = array($id, $key);
+		foreach (array('license', 'authority', 'confidence') as $column) $row[] = isset($data[$column]) ? $data[$column] : '';
+		fputcsv($fp, $row);
+	}
+}
+fclose($fp);
?>