Add error handling.

Change-Id: Icf244900c17b4d47b19dfb3208dd7e7b5c61bf0e
diff --git a/services/license_check.php b/services/license_check.php
new file mode 100755
index 0000000..df6a36b
--- /dev/null
+++ b/services/license_check.php
@@ -0,0 +1,246 @@
+<?php
+/*******************************************************************************
+ * Copyright (c) 2019 Eclipse Foundation and others.
+ * This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License 2.0
+ * which accompanies this distribution, and is available at
+ * https://www.eclipse.org/legal/epl-2.0/
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ *******************************************************************************/
+/*
+ * This file is a prototype for executing a license scan of a bill of
+ * materials. The bill of materials is provided as plain text with each
+ * line containing one unit of content (i.e., a library). Blank lines
+ * and comment lines are skipped (comment lines start with "#" or "//").
+ *
+ * The id for a unit of content may be expressed as Maven coordinates of the form
+ * "groupid:artifactid[:packaging]:version", as abridged Purl coordinates of the form
+ * "type/namespace/name@version", or as ClearlyDefined coordinates of the
+ * form "type/source/namespace/name/version". Mixing formats is supported.
+ *
+ * Output is expressed as ClearlyDefined ids.
+ *
+ * Two sources of information are used to map ids to license information and other
+ * metadata: first, the Eclipse Foundation data is consulted and then
+ * the ClearlyDefined services are called. Future versions of this script
+ * may consult other sources of data.
+ *
+ * usage example:
+ *
+ * curl -X POST http://localhost/projects/services/licenses.php \
+ * -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
+ *
+ * curl -X POST "http://localhost/projects/services/licenses.php" \
+ * -d "content=`mvn dependency:list -DskipTests -Dmaven.javadoc.skip=true | grep -Poh '\S+(?=:compile)' | sort | uniq`" | jsonpp | less
+ *
+ * yarn list | grep -Poh "(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)" \
+ * | curl -X POST "http://localhost/projects/services/licenses.php?XDEBUG_SESSION_START=ECLIPSE_DBGP" \
+ * --data-urlencode content@- | jsonpp | less
+ *
+ * Note that this works with an instance running on localhost.
+ *
+ * TODO Validate that we can scale to ~4K lines
+ * TODO support file upload.
+ * TODO Customize license list based on project (e.g., science.* can use LGPL)
+ * TODO Parameters for default type and provider.
+ */
+
+/**
+ * Try to massage the content identifier into ClearlyDefined coordinates. That is,
+ * for example, recognize Maven coordinates of the form <em>groupid:artifactid:version</em>, and
+ * convert them into the ClearlyDefined equivalent, <em>maven/mavencentral/groupid/artifactid/version</em>.
+ *
+ * Answers <code>null</code> when the id is not recognized.
+ *
+ * @param string $id
+ * @return NULL|string
+ */
+function normalizeId($id) {
+	$matches = null;
+
+	// Pass through anything already in ClearlyDefined form (five "/"-separated segments).
+	if (preg_match('/([\w@\-.]+)(?:\/[\w@\-.]+){4}/', $id, $matches)) return $matches[0];
+
+	/*
+	 * Deal with Maven coordinates. Tycho's use of p2 repositories needs a special
+	 * case: when the group id is exactly "p2.eclipse-plugin" or "p2.eclipse-feature"
+	 * (literal dot), we generate an id for p2/orbit instead of Maven Central.
+	 */
+	if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+)?:(\d+(?:\.\d+)*)/', $id, $matches)) {
+		if (preg_match('/^p2\.eclipse-(?:plugin|feature)$/', $matches[1])) {
+			return "p2/orbit/{$matches[1]}/{$matches[2]}/{$matches[3]}";
+		}
+		return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
+	}
+	/*
+	 * Some p2 cases (org.apache.ant only, I think) provide all of the information that
+	 * we need to match against Maven directly.
+	 *
+	 * e.g. "p2.eclipse-plugin:org.apache.ant:jar:lib/ant-jsch.jar:1.10.5.v20190526-1402"
+	 * maps to "maven/mavencentral/org.apache.ant/ant-jsch/1.10.5"
+	 */
+	if (preg_match('/p2\.eclipse-plugin:([\w@\-.]+)(?::[\w@\-.]+)?:lib\/([\w@\-.]+)\.jar:(\d+(?:\.\d+)*)/', $id, $matches)) {
+		return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
+	}
+
+	/*
+	 * Deal with pURL coordinates. These take the form "namespace/name@version",
+	 * e.g., "@webassemblyjs/wast-printer@1.7.8". The namespace is optional; when
+	 * absent, we use a dash ("-").
+	 *
+	 * Some of the versions provided by yarn (<code>yarn list</code>) carry range
+	 * qualifiers; the non-digit characters after the "@" are skipped for now.
+	 *
+	 * FIXME Sort out what to do with the range qualifiers
+	 *
+	 * FIXME Don't assume that values provided in pURL format are NPM.
+	 */
+	if (preg_match('/(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)/', $id, $matches)) {
+		$namespace = empty($matches[1]) ? '-' : $matches[1];
+		return "npm/npmjs/{$namespace}/{$matches[2]}/{$matches[3]}";
+	}
+
+	return null;
+}
+
+/**
+ * Parse a plain-text bill of materials (one id per line) into a result array:
+ * 'unmatched' maps each normalized id to array(); 'invalid' lists unrecognized lines.
+ */
+function loadFromString($content) {
+	$stream = fopen('php://memory','w+');
+	fwrite($stream, $content);
+	rewind($stream);
+	$results = array();
+	// Read to EOF; blank lines and comment lines ("#" or "//") are skipped, not terminal.
+	while (($line = fgets($stream)) !== false) {
+		$line = trim($line);
+		if ($line === '' || preg_match('/^(?:#|\/\/)/', $line)) continue;
+		if ($id = normalizeId($line)) {
+			$results['unmatched'][$id] = array();
+		} else {
+			$results['invalid'][] = $line;
+		}
+	}
+	fclose($stream);
+	return $results;
+}
+
+// Gather every package of a package-lock JSON document under 'unmatched'.
+function loadFromPackageLockString($content) {
+	$collected = array();
+	// The visitor reports one id per package found; each becomes an empty entry.
+	$record = function($id) use (&$collected) { $collected['unmatched'][$id] = array(); };
+	visitPackages(json_decode($content, true), $record);
+	return $collected;
+}
+
+/**
+ * Walk a package-lock style "dependencies" tree, handing each package's npm id to $callback.
+ */
+function visitPackages($root, Callable $callback) {
+	if (!isset($root['dependencies'])) return;
+	foreach($root['dependencies'] as $package => $info) {
+		// "scope/name" splits at its last slash; anything else gets namespace "-".
+		$parts = null;
+		$scoped = preg_match('/^(.+)\/(.+)$/', $package, $parts);
+		$namespace = $scoped ? $parts[1] : '-';
+		$name = $scoped ? $parts[2] : $package;
+		// Report the package first, then descend into its own dependencies.
+		$id = "npm/npmjs/{$namespace}/{$name}/{$info['version']}";
+		call_user_func($callback, $id);
+		visitPackages($info, $callback);
+	}
+}
+
+/**
+ * Approve ids that belong to Eclipse Foundation projects, judged by their names.
+ *
+ * Ids have the form "type/provider/namespace/name/version". An id is approved
+ * when its namespace starts with org.eclipse, org.polarsys or org.locationtech,
+ * or when the namespace is p2.eclipse-plugin / p2.eclipse-feature and the name
+ * starts with one of those prefixes. Approved ids are moved from the
+ * 'unmatched' set to the 'approved' set.
+ *
+ * @param array $results result sets keyed by status; modified in place
+ */
+function matchAgainstEclipseProjects(&$results) {
+	// When every submitted line was invalid there is no 'unmatched' set at all.
+	if (empty($results['unmatched'])) return;
+
+	$eclipse = '/^org\.(?:eclipse|polarsys|locationtech)/';
+	foreach(array_keys($results['unmatched']) as $id) {
+		// Ids posted via the "json" parameter are not normalized; pad short ones.
+		$parts = array_pad(explode('/', $id), 4, '');
+		$owned = preg_match($eclipse, $parts[2])
+			|| (preg_match('/^p2\.eclipse-(?:plugin|feature)$/', $parts[2]) && preg_match($eclipse, $parts[3]));
+		if (!$owned) continue;
+
+		// Matching ids are recorded as approved with empty license and URL fields.
+		unset($results['unmatched'][$id]);
+		$results['approved'][$id] = array(
+				'id' => $id, 'license' => '', 'status' => 'approved', 'sourceUrl' => '',
+				'definitionUrl' => '', 'authority' => 'eclipse', 'confidence' => 90
+		);
+	}
+}
+
+/*
+ * Match against the consolidated data from Eclipse Foundation
+ * sources. The consolidated data is stored in the dashboard
+ * database by a script that runs periodically.
+ *
+ * This function modifies the parameter. As we find matches, the id
+ * is removed from the "unmatched" set and the metadata that we do
+ * find is added to an array by status.
+ *
+ * @see project-services/capture/php/import_third_party_license_data.php
+ */
+function matchAgainstFoundationData(&$results) {
+	if (empty($results['unmatched'])) return; // e.g. every submitted line was invalid
+	foreach(array_keys($results['unmatched']) as $id) {
+		$sql = 'select
+				id, license, status, sourceUrl, definitionUrl, authority, confidence
+			from ThirdPartyLicense
+			where id=":id"'; // NOTE(review): quoted placeholder; confirm query() substitutes text rather than binds
+		// Each returned row moves its id out of 'unmatched' into the set named by its status.
+		query('dashboard', $sql, array(':id' => $id), function($row) use (&$results) {
+			unset($results['unmatched'][$row['id']]);
+			$results[$row['status']][$row['id']] = $row;
+		});
+	}
+}
+
+
+// Everything above this line could (and probably should) be factored out.
+
+// Handle the request: load ids from a POST parameter, match them, answer with JSON.
+header ("Content-type: application/json");
+header ("Content-Disposition: inline; filename=\"licenses.json\"");
+
+require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
+$App = new App ();
+require_once dirname(__FILE__) . "/../classes/database.inc";
+
+// Start empty so that a request carrying none of the parameters is well defined.
+$results = array();
+if (!empty($_POST['content'])) {
+	$results = loadFromString($_POST['content']);
+}
+if (!empty($_POST['package-lock'])) {
+	$results = loadFromPackageLockString($_POST['package-lock']);
+}
+if (!empty($_POST['json'])) {
+	// Only a decoded JSON array can supply ids; any other payload is ignored.
+	$ids = json_decode($_POST['json'], true);
+	if (is_array($ids)) $results = array('unmatched' => array_fill_keys($ids, array()));
+}
+
+if ($results) {
+	matchAgainstEclipseProjects($results);
+	matchAgainstFoundationData($results);
+}
+
+echo json_encode($results);
+
+?>
diff --git a/services/licenses.php b/services/licenses.php
index 27726bc..c81e398 100755
--- a/services/licenses.php
+++ b/services/licenses.php
@@ -31,10 +31,19 @@
  * curl -X POST http://localhost/projects/services/licenses.php \
  * -d $'content=npm/npmjs/@theia/variable-resolver/0.3.19\nnpm/npmjs/@theia/outline-view/0.3.19'
  *
+ * curl -X POST "http://localhost/projects/services/licenses.php" \
+ * -d "content=`mvn dependency:list -DskipTests -Dmaven.javadoc.skip=true | grep -Poh '\S+(?=:compile)' | sort | uniq`" | jsonpp | less
+ *
+ * yarn list | grep -Poh "(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)" \
+ * | curl -X POST "http://localhost/projects/services/licenses.php?XDEBUG_SESSION_START=ECLIPSE_DBGP" \
+ * --data-urlencode content@- | jsonpp | less
+ *
  * Note that this works with an instance running on localhost.
  *
  * TODO Validate that we can scale to ~4K lines
  * TODO support file upload.
+ * TODO Customize license list based on project (e.g., science.* can use LGPL)
+ * TODO Parameters for default type and provider.
  */
 
 /**
@@ -58,7 +67,7 @@
 	 * because of Tycho using p2 repositories. When the coordinates start with
 	 * "p2.eclipse-plugin" or "p2.eclipse-feature", we generate an id for p2/orbit.
 	 */
-	if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+)?:(\d+(?:\.\d+)*)/', $id, $matches)) {
+	if (preg_match('/([\w@\-.]+):([\w@\-.]+)(?::[\w@\-.]+){0,2}:(\d+(?:\.\d+)*)/', $id, $matches)) {
 		if (preg_match('/^p2.eclipse-(?:plugin|feature)$/', $matches[1])) {
 			return "p2/orbit/{$matches[1]}/{$matches[2]}/{$matches[3]}";
 		}
@@ -74,7 +83,24 @@
 	if (preg_match('/p2.eclipse-plugin:([\w@\-.]+)(?::[\w@\-.]+)?:lib\/([\w@\-.]+).jar:(\d+(?:\.\d+)*)/', $id, $matches)) {
 		return "maven/mavencentral/{$matches[1]}/{$matches[2]}/{$matches[3]}";
 	}
-	// TODO add Purl coordinates
+
+	/*
+	 * Deal with pURL coordinates. These take the form "namespace/name@version",
+	 * e.g., "@webassemblyjs/wast-printer@1.7.8". The namespace is optional; when
+	 * absent, we use a dash ("-").
+	 *
+	 * I was surprised to see range qualifiers with some of the versions provided
+	 * from yarn (<code>yarn list</code>). For now, we just ignore them.
+	 *
+	 * FIXME Sort out what to do with the range qualifiers
+	 *
+	 * FIXME Don't assume that values provided in pURL format are NPM.
+	 */
+	if (preg_match('/(?:([^\/\s]+)\/)?([^\/\s]+)@\D*(\d+(?:\.\d+)*)/', $id, $matches)) {
+		$namespace = empty($matches[1]) ? '-' : $matches[1];
+		return "npm/npmjs/{$namespace}/{$matches[2]}/{$matches[3]}";
+	}
+
 	return null;
 }
 
@@ -85,15 +111,15 @@
 
 	$results = array();
 
-	while ($line = fgets($stream)) {
-		if (empty(trim($line))) break;
+	while ($line = trim(fgets($stream))) {
+		if (empty($line)) break;
 		if (preg_match('/$#', $line)) break;
 		if (preg_match('/$\/\//', $line)) break;
 
 		if ($id = normalizeId(trim($line))) {
 			$results['unmatched'][$id] = array();
 		} else {
-			$results['invalid'][] = $line;
+			$results['invalid'][$line] = array();
 		}
 	}
 
@@ -101,10 +127,37 @@
 	return $results;
 }
 
+// Gather every package of a package-lock JSON document under 'unmatched'.
+function loadFromPackageLockString($content) {
+	$collected = array();
+	// The visitor reports one id per package found; each becomes an empty entry.
+	$record = function($id) use (&$collected) { $collected['unmatched'][$id] = array(); };
+	visitPackages(json_decode($content, true), $record);
+	return $collected;
+}
+
+/**
+ * Walk a package-lock style "dependencies" tree, handing each package's npm id to $callback.
+ */
+function visitPackages($root, Callable $callback) {
+	if (!isset($root['dependencies'])) return;
+	foreach($root['dependencies'] as $package => $info) {
+		// "scope/name" splits at its last slash; anything else gets namespace "-".
+		$parts = null;
+		$scoped = preg_match('/^(.+)\/(.+)$/', $package, $parts);
+		$namespace = $scoped ? $parts[1] : '-';
+		$name = $scoped ? $parts[2] : $package;
+		// Report the package first, then descend into its own dependencies.
+		$id = "npm/npmjs/{$namespace}/{$name}/{$info['version']}";
+		call_user_func($callback, $id);
+		visitPackages($info, $callback);
+	}
+}
+
 function matchAgainstEclipseProjects(&$results) {
 	foreach(array_keys($results['unmatched']) as $id) {
 		if ($parts = preg_split('/\//', $id)) {
-			if (preg_match('/^org\.eclipse/', $parts[2])) {
+			if (preg_match('/^org\.(?:eclipse|polarsys|locationtech)/', $parts[2])) {
 				unset($results['unmatched'][$id]);
 				$results['approved'][$id] = array(
 						'id' => $id,
@@ -116,7 +169,7 @@
 						'confidence' => 90
 				);
 			} elseif (preg_match('/^p2.eclipse-(?:plugin|feature)$/', $parts[2])) {
-				if (preg_match('/^org\.eclipse/', $parts[3])) {
+				if (preg_match('/^org\.(?:eclipse|polarsys|locationtech)/', $parts[3])) {
 					unset($results['unmatched'][$id]);
 					$results['approved'][$id] = array(
 							'id' => $id,
@@ -284,68 +337,74 @@
 function matchAgainstClearlyDefined(&$results) {
 	$licenses = Licenses::load();
 
-	$unmatched = array();
-	foreach($results['unmatched'] as $id => $ignore) {
-		/*
-		 * ClearlyDefined has a problem with resource types that it
-		 * doesn't know about. Let's prune out the p2 ones.
-		 */
-		if (preg_match('/^p2\//', $id)) continue;
-		$unmatched[] = $id;
-	}
-
-	if ($curl = curl_init('https://api.clearlydefined.io/definitions')) {
-		$options = array(
-				CURLOPT_USERAGENT => "Eclipse Foundation",
-				CURLOPT_RETURNTRANSFER => true,
-				CURLOPT_FOLLOWLOCATION => true,
-				CURLOPT_RETURNTRANSFER => 1,
-				CURLOPT_POST => TRUE,
-				CURLOPT_CONNECTTIMEOUT => 5,
-				CURLOPT_TIMEOUT => 10,
-				CURLOPT_POSTFIELDS => json_encode($unmatched),
-				CURLOPT_HTTPHEADER => array(
-						'accept: application/json',
-						'Content-Type: application/json'
-				)
-		);
-		curl_setopt_array($curl, $options);
-		$contents = curl_exec($curl);
-
-		if ($contents === false) {
-			$results['errors'][] = "ClearlyDefined: " . curl_error($curl);
+	reset($results['unmatched']);
+	while (current($results['unmatched']) !== FALSE) {
+		$unmatched = array();
+		while (current($results['unmatched']) !== FALSE) {
+			$id = key($results['unmatched']);
+			next($results['unmatched']);
+			/*
+			 * ClearlyDefined has a problem with resource types that it
+			 * doesn't know about. Let's prune out the p2 ones.
+			 */
+			if (preg_match('/^p2\//', $id)) continue;
+			$unmatched[] = $id;
+			if (count($unmatched) >= 1000) break;
 		}
 
-		curl_close($curl);
-	} else {
-		$results['errors'][] = "ClearlyDefined: Could not initialize the session.";
-	}
+		if ($curl = curl_init('https://api.clearlydefined.io/definitions')) {
+			$options = array(
+					CURLOPT_USERAGENT => "Eclipse Foundation",
+					CURLOPT_RETURNTRANSFER => true,
+					CURLOPT_FOLLOWLOCATION => true,
+					CURLOPT_RETURNTRANSFER => 1,
+					CURLOPT_POST => TRUE,
+					//CURLOPT_CONNECTTIMEOUT => 5,
+					//CURLOPT_TIMEOUT => 10,
+					CURLOPT_POSTFIELDS => json_encode($unmatched),
+					CURLOPT_HTTPHEADER => array(
+							'accept: application/json',
+							'Content-Type: application/json'
+					)
+			);
+			curl_setopt_array($curl, $options);
+			$contents = curl_exec($curl);
 
-	// If we get a cUrl error, just bail out.
-	if ($contents === false) return;
+			if ($contents === false) {
+				$results['errors'][] = "ClearlyDefined: " . curl_error($curl);
+			}
 
-	$json = json_decode($contents, true);
+			curl_close($curl);
+		} else {
+			$results['errors'][] = "ClearlyDefined: Could not initialize the session.";
+		}
 
-	if (isset($json['error'])) {
-		$results['errors'][] = "ClearlyDefined: {$json['error']['message']}";
-		return;
-	}
+		// If we get a cUrl error, just bail out.
+		if ($contents === false) return;
 
-	foreach($json as $id => $record) {
-		// TODO Investigate why the record sometimes has the _id and sometimes does not.
-		$metadata = new ClearlyDefinedMetadata($id, $record);
+		$json = json_decode($contents, true);
 
-		$status = $licenses->isApproved($metadata->getLicense()) ? 'approved' : 'restricted';
-		unset($results['unmatched'][$metadata->getId()]);
-		$results[$status][$metadata->getId()] = array(
-				'id' => $metadata->getId(),
-				'license' => $metadata->getLicense(),
-				'status' => $status,
-				'sourceUrl' => $metadata->getSourceUrl(),
-				'definitionUrl' => $metadata->getDefinitionUrl(),
-				'authority' => 'clearlydefined',
-				'confidence' => $metadata->getScore()
-		);
+		if (isset($json['error'])) {
+			$results['errors'][] = "ClearlyDefined: {$json['error']['message']}";
+			return;
+		}
+
+		foreach($json as $id => $record) {
+			// TODO Investigate why the record sometimes has the _id and sometimes does not.
+			$metadata = new ClearlyDefinedMetadata($id, $record);
+
+			$status = $licenses->isApproved($metadata->getLicense()) ? 'approved' : 'restricted';
+			unset($results['unmatched'][$metadata->getId()]);
+			$results[$status][$metadata->getId()] = array(
+					'id' => $metadata->getId(),
+					'license' => $metadata->getLicense(),
+					'status' => $status,
+					'sourceUrl' => $metadata->getSourceUrl(),
+					'definitionUrl' => $metadata->getDefinitionUrl(),
+					'authority' => 'clearlydefined',
+					'confidence' => $metadata->getScore()
+			);
+		}
 	}
 }
 
@@ -354,17 +413,20 @@
 header ("Content-type: text/csv");
 header ("Content-Disposition: \"inline; filename=licenses.csv\"");
 
-if (!isset($_POST['content'])) return;
-
 require_once (dirname ( __FILE__ ) . "/../../eclipse.org-common/system/app.class.php");
 $App = new App ();
 
 require_once dirname(__FILE__) . "/../classes/database.inc";
 
-$content = @$_POST['content'];
-
-if ($content) {
+if ($content = @$_POST['content']) {
 	$results = loadFromString($content);
+}
+
+if ($content = @$_POST['package-lock']) {
+	$results = loadFromPackageLockString($content);
+}
+
+if ($results) {
 	matchAgainstEclipseProjects($results);
 	matchAgainstFoundationData($results);
 	matchAgainstClearlyDefined($results);
@@ -372,6 +434,21 @@
 	$results = array();
 }
 
-echo json_encode($results);
+// echo json_encode($results);
+
+$fp = fopen('php://output', 'w');
+foreach($results as $key => $values) {
+	if ($key == 'errors') {
+		foreach ($values as $error) {
+			echo "# {$error}\n";
+		}
+	} else {
+		foreach ($values as $id => $data) {
+			$row = array($id, $key, $data['license'], $data['authority'], $data['confidence']);
+			fputcsv($fp, $row);
+		}
+	}
+}
+fclose($fp);
 
 ?>