-
-
Save msaari/a0f3c20c5f7e60b750233802264688e4 to your computer and use it in GitHub Desktop.
| <?php | |
| /** | |
| * Attachment processing intermediary to work between Relevanssi and a Tika server. | |
| * | |
| * Installation instructions: | |
| * 1. Save this as index.php. | |
| * 2. Change the Tika server URL in the constructor to point to your own Tika server. | |
| * 3. Upload this file in a directory on your server. | |
| * | |
| * @author Mikko Saari (mikko@mikkosaari.fi) | |
| * @license GNU General Public License 3.0 or later | |
| * @see https://www.relevanssi.com/user-manual/attachment-server/ | |
| */ | |
// Entry point: requests with "upload" in the query string are driven by the
// GET parameters, every other request by the POST fields.
$PdfProcessor = new PdfController();
$PdfProcessor->process( isset( $_GET['upload'] ) ? $_GET : $_POST );
/**
 * Attachment processing controller.
 *
 * Receives an attachment either as a URL to download or as a raw request
 * body, stores it in a temporary file, sends that file to a Tika server with
 * an HTTP PUT and responds with the extracted text as a JSON-encoded string.
 *
 * Every request ends in exit()/die(). Failures are logged and reported with
 * an HTTP 500 status and a JSON body {"error": "PDF Processor error: ..."}.
 */
class PdfController {
    /** @var string Directory where temporary attachment files are created. */
    private $tmp_path;

    /** @var string Full URL of the Tika endpoint the attachments are sent to. */
    private $tika_server;

    public function __construct() {
        $this->tmp_path    = '/tmp/';
        $this->tika_server = 'http://www.example.com:9998/tika'; // URL to the Tika server, include the port number and the path /tika
    }

    private function getTempPath() {
        return $this->tmp_path;
    }

    private function get_tika_server() {
        return $this->tika_server;
    }

    /**
     * Creates an empty temporary file whose name ends in the given extension.
     *
     * tempnam() already creates a file on disk, so that file is renamed to
     * carry the extension instead of being left behind as an orphan.
     *
     * @param string $type File extension without the dot; anything other
     *                     than ASCII letters and digits is dropped.
     * @return string|false Path of the temporary file, false on failure.
     */
    private function createTempFile($type) {
        $type = substr( preg_replace( '/[^A-Za-z0-9]/', '', (string) $type ), 0, 10 );
        $base = tempnam( $this->getTempPath(), $type . '_' );
        if ( false === $base ) {
            return false;
        }
        if ( '' === $type ) {
            return $base;
        }
        $named = $base . '.' . $type;
        if ( ! rename( $base, $named ) ) {
            $this->removeTempFile( $base );
            return false;
        }
        return $named;
    }

    /**
     * Deletes a temporary file if it exists.
     *
     * @param string $tempfile Path of the file to delete.
     */
    private function removeTempFile($tempfile) {
        if ( is_string( $tempfile ) && is_file( $tempfile ) ) {
            unlink( $tempfile );
        }
    }

    /**
     * Percent-encodes the characters that may not appear literally in a URL
     * path or query. Separators ("/", "?", "&", "=", ...) and existing
     * "%XX" escapes are left untouched, so a URL that is already valid
     * comes out byte-for-byte identical.
     *
     * @param string $component URL path or query string.
     * @return string
     */
    private function encodeUnsafeChars($component) {
        return preg_replace_callback(
            '/%(?![0-9A-Fa-f]{2})|[^A-Za-z0-9\-._~!$&\'()*+,;=:@\/?%]/',
            function ( $match ) {
                return rawurlencode( $match[0] );
            },
            $component
        );
    }

    /**
     * Validates the attachment URL and rebuilds it in a fetchable form.
     *
     * Only http and https are accepted: FILTER_VALIDATE_URL alone also lets
     * file:// URLs through, which would allow reading local files. The URL is
     * assembled here rather than with http_build_url(), which is not a core
     * PHP function. The fragment is dropped because it is never sent to a
     * server.
     *
     * @param mixed $url URL as received from the client.
     * @return string|false Normalized URL, or false when it is not acceptable.
     */
    private function normalizeUrl($url) {
        if ( ! is_string( $url ) ) {
            return false;
        }
        $parts = parse_url( trim( $url ) );
        if ( false === $parts || empty( $parts['scheme'] ) || empty( $parts['host'] ) ) {
            return false;
        }
        $scheme = strtolower( $parts['scheme'] );
        if ( 'http' !== $scheme && 'https' !== $scheme ) {
            return false;
        }

        $normalized = $scheme . '://';
        if ( isset( $parts['user'] ) ) {
            $normalized .= $parts['user'];
            if ( isset( $parts['pass'] ) ) {
                $normalized .= ':' . $parts['pass'];
            }
            $normalized .= '@';
        }
        $normalized .= $parts['host'];
        if ( isset( $parts['port'] ) ) {
            $normalized .= ':' . $parts['port'];
        }
        if ( isset( $parts['path'] ) ) {
            $normalized .= $this->encodeUnsafeChars( $parts['path'] );
        }
        if ( isset( $parts['query'] ) && '' !== $parts['query'] ) {
            $normalized .= '?' . $this->encodeUnsafeChars( $parts['query'] );
        }

        if ( false === filter_var( $normalized, FILTER_VALIDATE_URL ) ) {
            return false;
        }
        return $normalized;
    }

    /**
     * Takes the temp file, sends it to Tika, returns the results.
     *
     * The temporary file is always deleted, whether or not the request
     * succeeds.
     *
     * @param string $tempfile Path of the file holding the attachment.
     * @return string JSON-encoded text extracted by Tika.
     */
    private function processTempFile($tempfile) {
        clearstatcache( true, $tempfile );
        $size = is_file( $tempfile ) ? filesize( $tempfile ) : false;
        if ( false === $size || 0 === $size ) {
            // error_get_last() is null when no PHP error has been recorded.
            $last_error = error_get_last();
            $detail     = isset( $last_error['message'] ) ? $last_error['message'] : '';
            $this->removeTempFile( $tempfile );
            $this->returnError( 'Empty attachment file. Is the file publicly available? Server error: ' . $detail );
        }

        $fh_res = fopen( $tempfile, 'r' );
        if ( false === $fh_res ) {
            $this->removeTempFile( $tempfile );
            $this->returnError( 'Could not read the temporary attachment file.' );
        }

        $ch = curl_init( $this->get_tika_server() );
        curl_setopt( $ch, CURLOPT_PUT, 1 );
        curl_setopt( $ch, CURLOPT_INFILE, $fh_res );
        curl_setopt( $ch, CURLOPT_INFILESIZE, $size );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
        // Ask for plain text; the caller indexes the document text itself.
        curl_setopt( $ch, CURLOPT_HTTPHEADER, [ 'Accept: text/plain' ] );

        $text       = curl_exec( $ch );
        $curl_error = false === $text ? curl_error( $ch ) : '';
        $status     = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );

        fclose( $fh_res );
        $this->removeTempFile( $tempfile );

        // Keep transport and server failures apart from "document has no
        // text", so an error page is never returned as document content.
        if ( false === $text ) {
            $this->returnError( 'Could not reach the Tika server: ' . $curl_error );
        }
        if ( $status >= 400 ) {
            $this->returnError( 'Tika server responded with HTTP status ' . $status . '.' );
        }
        if ( empty( $text ) ) {
            $this->returnError( 'No text in the file.' );
        }

        $json = json_encode( $text );
        if ( false === $json ) {
            $this->returnError( 'Could not encode the extracted text: ' . json_last_error_msg() );
        }
        return $json;
    }

    /**
     * Fetches the attachment file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * NOTE(review): the URL comes from the client and is fetched by this
     * server. Only the scheme is restricted here; internal hosts are still
     * reachable. Restrict access to this script if that matters.
     *
     * @param string|null $url URL of the attachment to download.
     * @return string JSON-encoded text extracted by Tika.
     */
    public function processPDF($url = null) {
        $fetch_url = $this->normalizeUrl( $url );
        if ( false === $fetch_url ) {
            $this->returnError( "Not a valid URL." );
        }

        $extension = pathinfo( (string) parse_url( $fetch_url, PHP_URL_PATH ), PATHINFO_EXTENSION );
        $tempfile  = $this->createTempFile( $extension );
        if ( false === $tempfile ) {
            $this->returnError( 'Could not access the PDF file.' );
        }

        $source = fopen( $fetch_url, 'r' );
        if ( false === $source ) {
            $last_error = error_get_last();
            $detail     = isset( $last_error['message'] ) ? $last_error['message'] : '';
            $this->removeTempFile( $tempfile );
            $this->returnError( 'Could not download the attachment. Is the file publicly available? Server error: ' . $detail );
        }
        // Stream the download straight into the temp file.
        file_put_contents( $tempfile, $source );
        fclose( $source );

        return $this->processTempFile( $tempfile );
    }

    /**
     * Takes the uploaded file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * The attachment is read from the raw request body (php://input) and
     * streamed to disk, so it is never held in memory as a whole.
     *
     * @param mixed $upload Unused; kept for interface compatibility.
     * @return string JSON-encoded text extracted by Tika.
     */
    public function processUploadedPDF($upload = null) {
        $tempfile = $this->createTempFile( 'pdf' );
        if ( false === $tempfile ) {
            $this->returnError( 'Could not access the PDF file.' );
        }

        $input   = fopen( 'php://input', 'r' );
        $written = false === $input ? false : file_put_contents( $tempfile, $input );
        if ( is_resource( $input ) ) {
            fclose( $input );
        }
        if ( false === $written ) {
            $this->removeTempFile( $tempfile );
            $this->returnError( 'Could not store the uploaded file.' );
        }

        return $this->processTempFile( $tempfile );
    }

    /**
     * Logs the message, responds with HTTP 500 and a JSON error body, and
     * terminates the script. Never returns.
     *
     * @param string $msg Human-readable description of the failure.
     */
    private function returnError($msg) {
        error_log( $msg );
        if ( ! headers_sent() ) {
            header( 'HTTP/1.0 500 Internal Server Error' );
        }
        die( json_encode( array( 'error' => "PDF Processor error: " . $msg ) ) );
    }

    /**
     * Starts the processing and calls the right processing function.
     *
     * An empty request gets a plain "up and running" message. A request with
     * a "url" key is downloaded, one with an "upload" key is read from the
     * request body; "url" wins when both are present. Always terminates the
     * script.
     *
     * @param array $data Request parameters ($_GET or $_POST).
     */
    public function process($data) {
        if ( empty( $data ) ) {
            echo "Relevanssi attachment handling services is up and running.";
            exit();
        }

        // NOTE(review): requests are not authenticated. An API key check
        // used to sit here but is disabled, and this class has no
        // isValidKey() method; anyone who can reach this script can use it.

        if ( isset( $data['url'] ) ) {
            $text = $this->processPDF( $data['url'] );
            die( $text );
        }
        if ( isset( $data['upload'] ) ) {
            $text = $this->processUploadedPDF( $data['upload'] );
            die( $text );
        }
        $this->returnError( "No action selected." );
    }
}
// Flag constants for http_build_url(). Each one is declared only when it is
// not already defined.
defined('HTTP_URL_REPLACE') || define('HTTP_URL_REPLACE', 1);
defined('HTTP_URL_JOIN_PATH') || define('HTTP_URL_JOIN_PATH', 2);
defined('HTTP_URL_JOIN_QUERY') || define('HTTP_URL_JOIN_QUERY', 4);
defined('HTTP_URL_STRIP_USER') || define('HTTP_URL_STRIP_USER', 8);
defined('HTTP_URL_STRIP_PASS') || define('HTTP_URL_STRIP_PASS', 16);
defined('HTTP_URL_STRIP_AUTH') || define('HTTP_URL_STRIP_AUTH', 32);
defined('HTTP_URL_STRIP_PORT') || define('HTTP_URL_STRIP_PORT', 64);
defined('HTTP_URL_STRIP_PATH') || define('HTTP_URL_STRIP_PATH', 128);
defined('HTTP_URL_STRIP_QUERY') || define('HTTP_URL_STRIP_QUERY', 256);
defined('HTTP_URL_STRIP_FRAGMENT') || define('HTTP_URL_STRIP_FRAGMENT', 512);
defined('HTTP_URL_STRIP_ALL') || define('HTTP_URL_STRIP_ALL', 1024);
if (!function_exists('http_build_url')) {
    /**
     * Build a URL.
     *
     * The parts of the second URL will be merged into the first according to
     * the flags argument. Input that parse_url() cannot parse is treated as
     * an empty set of parts.
     *
     * @param mixed $url     (part(s) of) an URL in form of a string or
     *                       associative array like parse_url() returns
     * @param mixed $parts   same as the first argument
     * @param int   $flags   a bitmask of binary or'ed HTTP_URL constants;
     *                       HTTP_URL_REPLACE is the default
     * @param array $new_url if set, it will be filled with the parts of the
     *                       composed url like parse_url() would return
     * @return string
     */
    function http_build_url($url, $parts = array(), $flags = HTTP_URL_REPLACE, &$new_url = array())
    {
        is_array($url) || $url = parse_url($url);
        is_array($parts) || $parts = parse_url($parts);

        // parse_url() returns false on seriously malformed input; continue
        // with empty parts instead of writing array keys into `false`.
        is_array($url) || $url = array();
        is_array($parts) || $parts = array();

        isset($url['query']) && is_string($url['query']) || $url['query'] = null;
        isset($parts['query']) && is_string($parts['query']) || $parts['query'] = null;

        $keys = array('user', 'pass', 'port', 'path', 'query', 'fragment');

        // HTTP_URL_STRIP_ALL and HTTP_URL_STRIP_AUTH cover several other flags.
        if ($flags & HTTP_URL_STRIP_ALL) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS
                | HTTP_URL_STRIP_PORT | HTTP_URL_STRIP_PATH
                | HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT;
        } elseif ($flags & HTTP_URL_STRIP_AUTH) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS;
        }

        // Scheme and host are always replaced
        foreach (array('scheme', 'host') as $part) {
            if (isset($parts[$part])) {
                $url[$part] = $parts[$part];
            }
        }

        if ($flags & HTTP_URL_REPLACE) {
            foreach ($keys as $key) {
                if (isset($parts[$key])) {
                    $url[$key] = $parts[$key];
                }
            }
        } else {
            if (isset($parts['path']) && ($flags & HTTP_URL_JOIN_PATH)) {
                if (isset($url['path']) && substr($parts['path'], 0, 1) !== '/') {
                    // Relative path: keep the directory of the base path
                    // (everything before its last slash) and append the new
                    // path. Cutting at the last slash avoids removing other
                    // occurrences of the file name from the directory part.
                    $last_slash = strrpos($url['path'], '/');
                    $directory  = $last_slash === false ? '' : substr($url['path'], 0, $last_slash);
                    $url['path'] = rtrim($directory, '/') . '/' . ltrim($parts['path'], '/');
                } else {
                    $url['path'] = $parts['path'];
                }
            }

            if (isset($parts['query']) && ($flags & HTTP_URL_JOIN_QUERY)) {
                if (isset($url['query'])) {
                    parse_str($url['query'], $url_query);
                    parse_str($parts['query'], $parts_query);

                    $url['query'] = http_build_query(
                        array_replace_recursive(
                            $url_query,
                            $parts_query
                        )
                    );
                } else {
                    $url['query'] = $parts['query'];
                }
            }
        }

        if (isset($url['path']) && $url['path'] !== '' && substr($url['path'], 0, 1) !== '/') {
            $url['path'] = '/' . $url['path'];
        }

        foreach ($keys as $key) {
            $strip = 'HTTP_URL_STRIP_' . strtoupper($key);
            if ($flags & constant($strip)) {
                unset($url[$key]);
            }
        }

        // A component is present when it is set and not the empty string;
        // empty() would also drop legitimate values such as a query of "0".
        $has = function ($key) use ($url) {
            return isset($url[$key]) && $url[$key] !== '';
        };

        $parsed_string = '';

        if (!empty($url['scheme'])) {
            $parsed_string .= $url['scheme'] . '://';
        }

        if ($has('user')) {
            $parsed_string .= $url['user'];

            if (isset($url['pass'])) {
                $parsed_string .= ':' . $url['pass'];
            }

            $parsed_string .= '@';
        }

        if (!empty($url['host'])) {
            $parsed_string .= $url['host'];
        }

        if (!empty($url['port'])) {
            $parsed_string .= ':' . $url['port'];
        }

        if ($has('path')) {
            $parsed_string .= $url['path'];
        }

        if ($has('query')) {
            $parsed_string .= '?' . $url['query'];
        }

        if ($has('fragment')) {
            $parsed_string .= '#' . $url['fragment'];
        }

        $new_url = $url;

        return $parsed_string;
    }
}
I noticed that this PHP file only requests meta information from Tika Server but not the actual document text. This causes Relevanssi search to not work properly.
For Tika Server 2.7.0.0, the following line needs to be changed:
$this->tika_server = 'http://www.example.com:9998/meta';
to
$this->tika_server = 'http://www.example.com:9998/tika';
The endpoint must be /tika not /meta
Then, in the function processTempFile, add the following line somewhere between the curl_setopt lines:
curl_setopt( $ch, CURLOPT_HTTPHEADER, [ 'Accept: text/plain' ] );
With these changes in place, Tika Server will only return the content of the documents as plaintext and then Relevanssi Search will work.
In my WordPress setup, the intermediary script complains about a missing http_build_url() function; installing the PECL extension doesn't really help.
@acesuares As far as I can tell, http_build_query() is a standard PHP function that shouldn't require any modules installed. Which PHP version is your server running?
This Stack Overflow question has a replacement function you can use.
It's _url, not _query.
Oh yeah, the function from the Stack Overflow post makes URLs with %2F instead of /, and somehow those URLs don't work. If I replace all the %2F with / manually, the URLs do work.
Thank you!