Last active
December 23, 2025 03:55
-
-
Save msaari/a0f3c20c5f7e60b750233802264688e4 to your computer and use it in GitHub Desktop.
Relevanssi attachment indexing server intermediary script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| /** | |
| * Attachment processing intermediary to work between Relevanssi and a Tika server. | |
| * | |
| * Installation instructions: | |
| * 1. Save this as index.php. | |
| * 2. Change the Tika server URL in the constructor to point to your own Tika server. | |
| * 3. Upload this file in a directory on your server. | |
| * | |
| * @author Mikko Saari (mikko@mikkosaari.fi) | |
| * @license GNU General Public License 3.0 or later | |
| * @see https://www.relevanssi.com/user-manual/attachment-server/ | |
| */ | |
// Script entry point. PdfController is declared further down in this file;
// PHP hoists unconditional top-level class declarations, so it is usable here.
$PdfProcessor = new PdfController();

// Requests carrying an "upload" query argument are dispatched with the query
// parameters; every other request is dispatched with the POST fields.
$PdfProcessor->process( isset( $_GET['upload'] ) ? $_GET : $_POST );
/**
 * Receives an attachment (as a URL to fetch or as a raw request body), stores
 * it in a temporary file, sends that file to a Tika server and prints the
 * extracted text as a JSON string.
 *
 * Every failure path ends in returnError(), which terminates the script.
 */
class PdfController {
    /** @var string Directory in which temporary files are created. */
    private $tmp_path;

    /** @var string Full URL of the Tika endpoint. */
    private $tika_server;

    /**
     * Sets the temp directory and the Tika endpoint. Edit the values here to
     * match your own server.
     */
    public function __construct() {
        $this->tmp_path    = '/tmp/';
        $this->tika_server = 'http://www.example.com:9998/tika'; // URL to the Tika server, include the port number and the path /tika
    }

    /**
     * @return string Directory for temporary files.
     */
    private function getTempPath() {
        return $this->tmp_path;
    }

    /**
     * @return string URL of the Tika endpoint.
     */
    private function getTikaServer() {
        return $this->tika_server;
    }

    /**
     * Creates an empty temporary file and returns its path.
     *
     * The path returned by tempnam() is used as is: tempnam() already creates
     * the file, so appending a suffix to the name would leave that file behind
     * and write to a second, different path.
     *
     * @param string $type File type hint, only used (sanitised) as name prefix.
     * @return string|false Path of the created file, false on failure.
     */
    private function createTempFile( $type ) {
        $prefix = preg_replace( '/[^A-Za-z0-9]/', '', (string) $type );
        $prefix = ( null === $prefix || '' === $prefix ) ? 'tmp' : substr( $prefix, 0, 10 );

        return tempnam( $this->getTempPath(), $prefix . '_' );
    }

    /**
     * Deletes a temporary file if it exists.
     *
     * @param string $tempfile Path of the file to remove.
     */
    private function discardTempFile( $tempfile ) {
        if ( is_string( $tempfile ) && is_file( $tempfile ) ) {
            unlink( $tempfile );
        }
    }

    /**
     * Percent-encodes bytes that may not appear in a URL query string while
     * leaving delimiters ("=", "&", ...) and existing %XX escapes untouched.
     *
     * @param string $value Raw query string.
     * @return string Query string that is safe to put in a URL.
     */
    private function encodeUnsafeCharacters( $value ) {
        return preg_replace_callback(
            '/[^A-Za-z0-9\-._~!$&\'()*+,;=:@\/?%]/',
            function ( $match ) {
                return rawurlencode( $match[0] );
            },
            $value
        );
    }

    /**
     * Validates a user supplied URL and returns it in a fetchable form.
     *
     * Only http and https are accepted, so the service cannot be used to read
     * local files through file:// or php:// URLs. Each path segment is encoded
     * on its own, which keeps the "/" separators intact and does not
     * double-encode segments that already are percent-encoded. The fragment is
     * dropped because it is never sent to a server.
     *
     * NOTE(review): URLs pointing at internal hosts are still accepted; limit
     * who can reach this script if that matters in your network.
     *
     * @param mixed $url URL from the request.
     * @return string|false Normalised URL, or false when it is not acceptable.
     */
    private function normalizeUrl( $url ) {
        if ( ! is_string( $url ) || '' === trim( $url ) ) {
            return false;
        }

        $parts = parse_url( trim( $url ) );
        if ( ! is_array( $parts ) || empty( $parts['scheme'] ) || empty( $parts['host'] ) ) {
            return false;
        }

        $scheme = strtolower( $parts['scheme'] );
        if ( 'http' !== $scheme && 'https' !== $scheme ) {
            return false;
        }

        $clean = $scheme . '://';
        if ( isset( $parts['user'] ) && '' !== $parts['user'] ) {
            $clean .= $parts['user'];
            if ( isset( $parts['pass'] ) ) {
                $clean .= ':' . $parts['pass'];
            }
            $clean .= '@';
        }
        $clean .= $parts['host'];
        if ( ! empty( $parts['port'] ) ) {
            $clean .= ':' . $parts['port'];
        }

        // Decode first so that "a%20b" and "a b" both end up as "a%20b".
        $segments = explode( '/', isset( $parts['path'] ) ? $parts['path'] : '/' );
        foreach ( $segments as $index => $segment ) {
            $segments[ $index ] = rawurlencode( rawurldecode( $segment ) );
        }
        $clean .= implode( '/', $segments );

        if ( isset( $parts['query'] ) && '' !== $parts['query'] ) {
            $clean .= '?' . $this->encodeUnsafeCharacters( $parts['query'] );
        }

        return false === filter_var( $clean, FILTER_VALIDATE_URL ) ? false : $clean;
    }

    /**
     * Takes the temp file, sends it to Tika, returns the results.
     *
     * The temp file is always removed, also on the error paths.
     *
     * @param string $tempfile Path of the file to process.
     * @return string JSON-encoded text extracted by Tika.
     */
    private function processTempFile( $tempfile ) {
        $size = is_file( $tempfile ) ? filesize( $tempfile ) : false;
        if ( false === $size || 0 === $size ) {
            // Read the last error before cleaning up so it still refers to the
            // failed download rather than to the clean-up.
            $last_error = error_get_last();
            $details    = isset( $last_error['message'] ) ? $last_error['message'] : '';
            $this->discardTempFile( $tempfile );
            $this->returnError( 'Empty attachment file. Is the file publicly available? Server error: ' . $details );
        }

        $fh_res = fopen( $tempfile, 'r' );
        if ( false === $fh_res ) {
            $this->discardTempFile( $tempfile );
            $this->returnError( 'Could not access the PDF file.' );
        }

        $ch = curl_init( $this->getTikaServer() );
        if ( false === $ch ) {
            fclose( $fh_res );
            $this->discardTempFile( $tempfile );
            $this->returnError( 'Could not initialize the connection to the Tika server.' );
        }

        curl_setopt( $ch, CURLOPT_PUT, 1 );
        curl_setopt( $ch, CURLOPT_INFILE, $fh_res );
        curl_setopt( $ch, CURLOPT_INFILESIZE, $size );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
        curl_setopt( $ch, CURLOPT_HTTPHEADER, [ 'Accept: text/plain' ] );

        $text       = curl_exec( $ch );
        $curl_error = curl_error( $ch );
        $status     = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );

        fclose( $fh_res );
        $this->discardTempFile( $tempfile );

        if ( false === $text ) {
            $this->returnError( 'Could not get a response from the Tika server: ' . $curl_error );
        }
        if ( $status >= 400 ) {
            // Do not pass a Tika error page on as if it were document text.
            $this->returnError( 'Tika server responded with HTTP status ' . $status . '.' );
        }
        if ( '' === trim( $text ) ) {
            $this->returnError( 'No text in the file.' );
        }

        // Invalid UTF-8 makes json_encode() return false; substitute the bad
        // bytes where the PHP version supports it.
        $flags = defined( 'JSON_INVALID_UTF8_SUBSTITUTE' ) ? JSON_INVALID_UTF8_SUBSTITUTE : 0;
        $json  = json_encode( $text, $flags );
        if ( false === $json ) {
            $this->returnError( 'Could not encode the extracted text as JSON.' );
        }

        return $json;
    }

    /**
     * Fetches the attachment file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * @param string|null $url URL of the attachment; must be http or https.
     * @return string JSON-encoded text extracted by Tika.
     */
    public function processPDF($url = null) {
        $clean_url = $this->normalizeUrl( $url );
        if ( false === $clean_url ) {
            $this->returnError( 'Not a valid URL.' );
        }

        $path      = parse_url( $clean_url, PHP_URL_PATH );
        $extension = is_string( $path ) ? pathinfo( rawurldecode( $path ), PATHINFO_EXTENSION ) : '';

        $tempfile = $this->createTempFile( $extension );
        if ( false === $tempfile ) {
            $this->returnError( 'Could not access the PDF file.' );
        }

        // When the download cannot be opened the temp file stays empty and
        // processTempFile() reports that together with the PHP error message.
        $source = fopen( $clean_url, 'r' );
        if ( false !== $source ) {
            file_put_contents( $tempfile, $source );
            fclose( $source );
        }

        return $this->processTempFile( $tempfile );
    }

    /**
     * Takes the uploaded file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * The file content is read from the raw request body.
     *
     * @param mixed $upload Unused; kept so existing callers keep working.
     * @return string JSON-encoded text extracted by Tika.
     */
    public function processUploadedPDF($upload = null) {
        $tempfile = $this->createTempFile( 'pdf' );
        if ( false === $tempfile ) {
            $this->returnError( 'Could not access the PDF file.' );
        }

        // Copy the body as a stream so large uploads are not held in memory.
        $input = fopen( 'php://input', 'r' );
        if ( false !== $input ) {
            file_put_contents( $tempfile, $input );
            fclose( $input );
        }

        return $this->processTempFile( $tempfile );
    }

    /**
     * Logs the message, answers with HTTP 500 and a JSON error object, and
     * terminates the script.
     *
     * @param string $msg Human readable error description.
     */
    private function returnError($msg) {
        error_log( $msg );
        if ( ! headers_sent() ) {
            header( 'HTTP/1.0 500 Internal Server Error' );
        }
        die( json_encode( array( 'error' => 'PDF Processor error: ' . $msg ) ) );
    }

    /**
     * Starts the processing and calls the right processing function.
     *
     * Always terminates the script: it prints either a status line, the JSON
     * result or a JSON error.
     *
     * NOTE(review): no API key is checked here, so anybody who can reach this
     * script can use it. Restrict access on the web server if needed.
     *
     * @param array $data Request parameters; "url" or "upload" selects the action.
     */
    public function process($data) {
        if ( empty( $data ) ) {
            echo "Relevanssi attachment handling services is up and running.";
            exit();
        }

        if ( isset( $data['url'] ) ) {
            die( $this->processPDF( $data['url'] ) );
        }

        if ( isset( $data['upload'] ) ) {
            die( $this->processUploadedPDF( $data['upload'] ) );
        }

        $this->returnError( 'No action selected.' );
    }
}
// Flag constants understood by http_build_url(). Each value is a distinct
// power of two so that several flags can be combined with a bitwise OR.
// A constant that already exists is left untouched.
foreach (
    array(
        'HTTP_URL_REPLACE'        => 1,
        'HTTP_URL_JOIN_PATH'      => 2,
        'HTTP_URL_JOIN_QUERY'     => 4,
        'HTTP_URL_STRIP_USER'     => 8,
        'HTTP_URL_STRIP_PASS'     => 16,
        'HTTP_URL_STRIP_AUTH'     => 32,
        'HTTP_URL_STRIP_PORT'     => 64,
        'HTTP_URL_STRIP_PATH'     => 128,
        'HTTP_URL_STRIP_QUERY'    => 256,
        'HTTP_URL_STRIP_FRAGMENT' => 512,
        'HTTP_URL_STRIP_ALL'      => 1024,
    ) as $http_url_flag_name => $http_url_flag_value
) {
    if (!defined($http_url_flag_name)) {
        define($http_url_flag_name, $http_url_flag_value);
    }
}
// Do not leave the loop variables behind in the global scope.
unset($http_url_flag_name, $http_url_flag_value);
if (!function_exists('http_build_url')) {
    /**
     * Build a URL.
     *
     * The parts of the second URL will be merged into the first according to
     * the flags argument.
     *
     * Because this declaration is conditional, the function only exists once
     * execution has reached this statement; code that runs earlier in the
     * request cannot call it.
     *
     * @param mixed $url     (part(s) of) an URL in form of a string or
     *                       associative array like parse_url() returns
     * @param mixed $parts   same as the first argument
     * @param int   $flags   a bitmask of binary or'ed HTTP_URL constants;
     *                       HTTP_URL_REPLACE is the default
     * @param array $new_url if set, it will be filled with the parts of the
     *                       composed url like parse_url() would return
     * @return string the composed URL
     */
    function http_build_url($url, $parts = array(), $flags = HTTP_URL_REPLACE, &$new_url = array())
    {
        is_array($url) || $url = parse_url($url);
        is_array($parts) || $parts = parse_url($parts);

        // parse_url() returns false for seriously malformed input; continue
        // with an empty part list instead of indexing into a boolean.
        is_array($url) || $url = array();
        is_array($parts) || $parts = array();

        isset($url['query']) && is_string($url['query']) || $url['query'] = null;
        isset($parts['query']) && is_string($parts['query']) || $parts['query'] = null;

        $keys = array('user', 'pass', 'port', 'path', 'query', 'fragment');

        // HTTP_URL_STRIP_ALL and HTTP_URL_STRIP_AUTH cover several other flags.
        if ($flags & HTTP_URL_STRIP_ALL) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS
                | HTTP_URL_STRIP_PORT | HTTP_URL_STRIP_PATH
                | HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT;
        } elseif ($flags & HTTP_URL_STRIP_AUTH) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS;
        }

        // Scheme and host are always replaced.
        foreach (array('scheme', 'host') as $part) {
            if (isset($parts[$part])) {
                $url[$part] = $parts[$part];
            }
        }

        if ($flags & HTTP_URL_REPLACE) {
            foreach ($keys as $key) {
                if (isset($parts[$key])) {
                    $url[$key] = $parts[$key];
                }
            }
        } else {
            if (isset($parts['path']) && ($flags & HTTP_URL_JOIN_PATH)) {
                if (isset($url['path']) && substr($parts['path'], 0, 1) !== '/') {
                    // Relative path: keep everything up to the last "/" of the
                    // base path and append the new path. Cutting at the last
                    // slash (rather than string-replacing the last segment)
                    // leaves directories that share the segment's name intact
                    // and handles a trailing slash correctly.
                    $last_slash = strrpos($url['path'], '/');
                    $base_dir   = false === $last_slash ? '' : substr($url['path'], 0, $last_slash);

                    $url['path'] = rtrim($base_dir, '/') . '/' . ltrim($parts['path'], '/');
                } else {
                    $url['path'] = $parts['path'];
                }
            }

            if (isset($parts['query']) && ($flags & HTTP_URL_JOIN_QUERY)) {
                if (isset($url['query'])) {
                    parse_str($url['query'], $url_query);
                    parse_str($parts['query'], $parts_query);

                    // Values from $parts win over values from $url.
                    $url['query'] = http_build_query(
                        array_replace_recursive(
                            $url_query,
                            $parts_query
                        )
                    );
                } else {
                    $url['query'] = $parts['query'];
                }
            }
        }

        // A non-empty path always starts with a slash.
        if (isset($url['path']) && $url['path'] !== '' && substr($url['path'], 0, 1) !== '/') {
            $url['path'] = '/' . $url['path'];
        }

        // Drop every part whose HTTP_URL_STRIP_* flag is set.
        foreach ($keys as $key) {
            $strip = 'HTTP_URL_STRIP_' . strtoupper($key);
            if ($flags & constant($strip)) {
                unset($url[$key]);
            }
        }

        $parsed_string = '';

        if (!empty($url['scheme'])) {
            $parsed_string .= $url['scheme'] . '://';
        }

        if (!empty($url['user'])) {
            $parsed_string .= $url['user'];

            if (isset($url['pass'])) {
                $parsed_string .= ':' . $url['pass'];
            }

            $parsed_string .= '@';
        }

        if (!empty($url['host'])) {
            $parsed_string .= $url['host'];
        }

        if (!empty($url['port'])) {
            $parsed_string .= ':' . $url['port'];
        }

        if (!empty($url['path'])) {
            $parsed_string .= $url['path'];
        }

        if (!empty($url['query'])) {
            $parsed_string .= '?' . $url['query'];
        }

        if (!empty($url['fragment'])) {
            $parsed_string .= '#' . $url['fragment'];
        }

        $new_url = $url;

        return $parsed_string;
    }
}
Author
@acesuares As far as I can tell, http_build_query() is a standard PHP function that shouldn't require any modules installed. Which PHP version is your server running?
This Stack Overflow question has a replacement function you can use.
It's _url, not _query.
Oh yeah, the Stack Overflow post produces URLs with %2F instead of /, and somehow those URLs don't work. If I replace all the %2F with / manually, the URLs do work.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In my WordPress setup, the intermediary script complains about a missing http_build_url() function; installing PECL doesn't really help.