Utility.php in Search API Solr 8.2
Same filename and directory in other branches
Namespace
Drupal\search_api_solr\UtilityFile
src/Utility/Utility.phpView source
<?php
namespace Drupal\search_api_solr\Utility;
use Drupal\Component\Utility\NestedArray;
use Drupal\search_api\ServerInterface;
use Drupal\search_api_solr\Entity\SolrFieldType;
use Drupal\search_api_solr\SolrFieldTypeInterface;
/**
* The separator to indicate the start of a language ID. We must not use any
* character that has a special meaning within regular expressions. Additionally
* we have to avoid characters that are valid for Drupal machine names.
* The end of a language ID is indicated by an underscore '_' which could not
* occur within the language ID itself because Drupal uses lanague tags.
*
* @see http://de2.php.net/manual/en/regexp.reference.meta.php
* @see https://www.w3.org/International/articles/language-tags/
*/
define('SEARCH_API_SOLR_LANGUAGE_SEPARATOR', ';');
/**
* Provides various helper functions for Solr backends.
*/
class Utility {
/**
* Retrieves Solr-specific data for available data types.
*
* Returns the data type information for the default Search API datatypes, the
* Solr specific data types and custom data types defined by
* hook_search_api_data_type_info().
* Names for default data types are not included, since they are not relevant
* to the Solr service class.
*
* We're adding some extra Solr field information for the default search api
* data types (as well as on behalf of a couple contrib field types). The
* extra information we're adding is documented in
* search_api_solr_hook_search_api_data_type_info(). You can use the same
* additional keys in hook_search_api_data_type_info() to support custom
* dynamic fields in your indexes with Solr.
*
* @param string|null $type
* (optional) A specific type for which the information should be returned.
* Defaults to returning all information.
*
* @return array|null
* If $type was given, information about that type or NULL if it is unknown.
* Otherwise, an array of all types. The format in both cases is the same as
* for search_api_get_data_type_info().
*
* @see search_api_get_data_type_info()
* @see search_api_solr_hook_search_api_data_type_info()
*/
public static function getDataTypeInfo($type = NULL) {
$types =& drupal_static(__FUNCTION__);
if (!isset($types)) {
// Grab the stock search_api data types.
/** @var \Drupal\search_api\DataType\DataTypePluginManager $data_type_service */
$data_type_service = \Drupal::service('plugin.manager.search_api.data_type');
$types = $data_type_service
->getDefinitions();
// Add our extras for the default search api fields.
$types = NestedArray::mergeDeep($types, [
'text' => [
'prefix' => 't',
],
'string' => [
'prefix' => 's',
],
'integer' => [
// Use trie field for better sorting.
'prefix' => 'it',
],
'decimal' => [
// Use trie field for better sorting.
'prefix' => 'ft',
],
'date' => [
'prefix' => 'd',
],
'duration' => [
// Use trie field for better sorting.
'prefix' => 'it',
],
'boolean' => [
'prefix' => 'b',
],
'uri' => [
'prefix' => 's',
],
]);
// Extra data type info.
$extra_types_info = [
// Provided by Search API Location module.
'location' => [
'prefix' => 'loc',
],
// @todo Who provides that type?
'geohash' => [
'prefix' => 'geo',
],
// Provided by Search API Location module.
'rpt' => [
'prefix' => 'rpt',
],
];
// For the extra types, only add our extra info if it's already been
// defined.
foreach ($extra_types_info as $key => $info) {
if (array_key_exists($key, $types)) {
// Merge our extras into the data type info.
$types[$key] += $info;
}
}
}
// Return the info.
if (isset($type)) {
return isset($types[$type]) ? $types[$type] : NULL;
}
return $types;
}
/**
* Returns a unique hash for the current site.
*
* This is used to identify Solr documents from different sites within a
* single Solr server.
*
* @return string
* A unique site hash, containing only alphanumeric characters.
*/
public static function getSiteHash() {
// Copied from apachesolr_site_hash().
if (!($hash = \Drupal::config('search_api_solr.settings')
->get('site_hash'))) {
global $base_url;
$hash = substr(base_convert(sha1(uniqid($base_url, TRUE)), 16, 36), 0, 6);
\Drupal::configFactory()
->getEditable('search_api_solr.settings')
->set('site_hash', $hash)
->save();
}
return $hash;
}
/**
* Retrieves a list of all config files of a server's Solr backend.
*
* @param \Drupal\search_api\ServerInterface $server
* The Solr server whose files should be retrieved.
* @param string $dir_name
* (optional) The directory that should be searched for files. Defaults to
* the root config directory.
*
* @return array
* An associative array of all config files in the given directory. The keys
* are the file names, values are arrays with information about the file.
* The files are returned in alphabetical order and breadth-first.
*
* @throws \Drupal\search_api\SearchApiException
* If a problem occurred while retrieving the files.
*/
public static function getServerFiles(ServerInterface $server, $dir_name = NULL) {
/** @var \Drupal\search_api_solr\SolrBackendInterface $backend */
$backend = $server
->getBackend();
$response = $backend
->getSolrConnector()
->getFile($dir_name);
// Search for directories and recursively merge directory files.
$files_data = json_decode($response
->getBody(), TRUE);
$files_list = $files_data['files'];
$dir_length = strlen($dir_name) + 1;
$result = [
'' => [],
];
foreach ($files_list as $file_name => $file_info) {
// Annoyingly, Solr 4.7 changed the way the admin/file handler returns
// the file names when listing directory contents: the returned name is
// now only the base name, not the complete path from the config root
// directory. We therefore have to check for this case.
if ($dir_name && substr($file_name, 0, $dir_length) !== "{$dir_name}/") {
$file_name = "{$dir_name}/" . $file_name;
}
if (empty($file_info['directory'])) {
$result[''][$file_name] = $file_info;
}
else {
$result[$file_name] = static::getServerFiles($server, $file_name);
}
}
ksort($result);
ksort($result['']);
return array_reduce($result, 'array_merge', []);
}
/**
* Returns the highlighted keys from a snippet highlighted by Solr.
*
* @param string|array $snippets
* The snippet(s) to format.
*
* @return array
* The highlighted keys.
*/
public static function getHighlightedKeys($snippets) {
if (is_string($snippets)) {
$snippets = [
$snippets,
];
}
$keys = [];
foreach ($snippets as $snippet) {
if (preg_match_all('@\\[HIGHLIGHT\\](.+?)\\[/HIGHLIGHT\\]@', $snippet, $matches)) {
$keys = array_merge($keys, $matches[1]);
}
}
return array_unique($keys);
}
/**
* Changes highlighting tags from our custom, HTML-safe ones to HTML.
*
* @param string|array $snippet
* The snippet(s) to format.
*
* @return string|array
* The snippet(s), properly formatted as HTML.
*/
public static function formatHighlighting($snippet, $prefix = '<strong>', $suffix = '</strong>') {
return str_replace([
'[HIGHLIGHT]',
'[/HIGHLIGHT]',
], [
$prefix,
$suffix,
], $snippet);
}
/**
* Encodes field names to avoid characters that are not supported by solr.
*
* Solr doesn't restrict the characters used to build field names. But using
* non java identifiers within a field name can cause different kind of
* trouble when running queries. Java identifiers are only consist of
* letters, digits, '$' and '_'. See
* https://issues.apache.org/jira/browse/SOLR-3996 and
* http://docs.oracle.com/cd/E19798-01/821-1841/bnbuk/index.html
* For full compatibility the '$' has to be avoided, too. And there're more
* restrictions regarding the field name itself. See
* https://cwiki.apache.org/confluence/display/solr/Defining+Fields
* "Field names should consist of alphanumeric or underscore characters only
* and not start with a digit ... Names with both leading and trailing
* underscores (e.g. _version_) are reserved." Field names starting with
* digits or underscores are already avoided by our schema. The same is true
* for the names of field types. See
* https://cwiki.apache.org/confluence/display/solr/Field+Type+Definitions+and+Properties
* "It is strongly recommended that names consist of alphanumeric or
* underscore characters only and not start with a digit. This is not
* currently strictly enforced."
*
* This function therefore encodes all forbidden characters in their
* hexadecimal equivalent encapsulated by a leading sequence of '_X' and a
* termination character '_'. Example:
* "tm_entity:node/body" becomes "tm_entity_X3a_node_X2f_body".
*
* As a consequence the sequence '_X' itself needs to be encoded if it occurs
* within a field name. Example: "last_XMas" becomes "last_X5f58_Mas".
*
* @param string $field_name
* The field name.
*
* @return string
* The encoded field name.
*/
public static function encodeSolrName($field_name) {
return preg_replace_callback('/([^\\da-zA-Z_]|_X)/u', function ($matches) {
return '_X' . bin2hex($matches[1]) . '_';
}, $field_name);
}
/**
* Decodes solr field names.
*
* This function therefore decodes all forbidden characters from their
* hexadecimal equivalent encapsulated by a leading sequence of '_X' and a
* termination character '_'. Example:
* "tm_entity_X3a_node_X2f_body" becomes "tm_entity:node/body".
*
* @see encodeSolrDynamicFieldName() for details.
*
* @param string $field_name
* Encoded field name.
*
* @return string
* The decoded field name
*/
public static function decodeSolrName($field_name) {
return preg_replace_callback('/_X([\\dabcdef]+?)_/', function ($matches) {
return hex2bin($matches[1]);
}, $field_name);
}
/**
* Maps a Solr field name to its language-specific equivalent.
*
* For example the dynamic field tm_* will become tm;en* for English.
* Following this pattern we also have fall backs automatically:
* - tm;de-AT_*
* - tm;de_*
* - tm_*
* This concept bases on the fact that "longer patterns will be matched first.
* If equal size patterns both match,the first appearing in the schema will be
* used." This is not obvious from the example above. But you need to take
* into account that the real field name for solr will be encoded. So the real
* values for the example above are:
* - tm_X3b_de_X2d_AT_*
* - tm_X3b_de_*
* - tm_*
*
* @see \Drupal\search_api_solr\Utility\Utility::encodeSolrName()
* @see https://wiki.apache.org/solr/SchemaXml#Dynamic_fields
*
* @param string $field_name
* The field name.
* @param string $language_id
* The Drupal langauge code.
*
* @return string
* The language-specific name.
*/
public static function getLanguageSpecificSolrDynamicFieldNameForSolrDynamicFieldName($field_name, $language_id) {
if ('twm_suggest' == $field_name) {
return 'twm_suggest';
}
return Utility::modifySolrDynamicFieldName($field_name, '@^([a-z]+)_@', '$1' . SEARCH_API_SOLR_LANGUAGE_SEPARATOR . $language_id . '_');
}
/**
* Maps a language-specific Solr field name to its unspecific equivalent.
*
* For example the dynamic field tm;en_* for English will become tm_*.
*
* @see \Drupal\search_api_solr\Utility\Utility::getLanguageSpecificSolrDynamicFieldNameForSolrDynamicFieldName()
* @see \Drupal\search_api_solr\Utility\Utility::encodeSolrName()
* @see https://wiki.apache.org/solr/SchemaXml#Dynamic_fields
*
* @param string $field_name
* The field name.
* @param string $language_id
* The Drupal langauge code.
*
* @return string
* The language-specific name.
*/
public static function getSolrDynamicFieldNameForLanguageSpecificSolrDynamicFieldName($field_name) {
return Utility::modifySolrDynamicFieldName($field_name, '@^([a-z]+)' . SEARCH_API_SOLR_LANGUAGE_SEPARATOR . '[^_]+?_@', '$1_');
}
/**
* Modifies a dynamic Solr field's name using a regular expression.
*
* If the field name is encoded it will be decoded before the regular
* expression runs and encoded again before the modified is returned.
*
* @see \Drupal\search_api_solr\Utility\Utility::encodeSolrName()
*
* @param string $field_name
* The dynamic Solr field name.
* @param $pattern
* The regex.
* @param $replacement
* The replacement for the pattern match.
*
* @return string
* The modified dynamic Solr field name.
*/
protected static function modifySolrDynamicFieldName($field_name, $pattern, $replacement) {
$decoded_field_name = Utility::decodeSolrName($field_name);
$modified_field_name = preg_replace($pattern, $replacement, $decoded_field_name);
if ($decoded_field_name != $field_name) {
$modified_field_name = Utility::encodeSolrName($modified_field_name);
}
return $modified_field_name;
}
/**
* Gets the language-specific prefix for a dynamic Solr field.
*
* @param string $prefix
* The language-unspecific prefix.
* @param string $language_id
* The Drupal language code.
*
* @return string
* The language-specific prefix.
*/
public static function getLanguageSpecificSolrDynamicFieldPrefix($prefix, $language_id) {
return $prefix . SEARCH_API_SOLR_LANGUAGE_SEPARATOR . $language_id . '_';
}
/**
* Extracts the language code from a language-specific dynamic Solr field.
*
* @param string $field_name
* The language-specific dynamic Solr field name.
*
* @return mixed
* The Drupal language code as string or boolean FALSE if no language code
* could be extracted.
*/
public static function getLanguageIdFromLanguageSpecificSolrDynamicFieldName($field_name) {
$decoded_field_name = Utility::decodeSolrName($field_name);
if (preg_match('@^[a-z]+' . SEARCH_API_SOLR_LANGUAGE_SEPARATOR . '([^_]+?)_@', $decoded_field_name, $matches)) {
return $matches[1];
}
return FALSE;
}
/**
* Extracts the language-specific definition from a dynamic Solr field.
*
* @param string $field_name
* The field name.
*
* @return mixed
* The language-specific prefix as string or boolean FALSE if no prefix
* could be extracted.
*/
public static function extractLanguageSpecificSolrDynamicFieldDefinition($field_name) {
$decoded_field_name = Utility::decodeSolrName($field_name);
if (preg_match('@^[a-z]+' . SEARCH_API_SOLR_LANGUAGE_SEPARATOR . '[^_]+?_@', $decoded_field_name, $matches)) {
return Utility::encodeSolrName($matches[0]) . '*';
}
return FALSE;
}
/**
* @param array $tags
*
* @return string
*/
public static function buildSuggesterContextFilterQuery(array $tags) {
$cfq = [];
foreach ($tags as $tag) {
$cfg[] = '+' . self::encodeSolrName($tag);
}
return implode(' ', $cfg);
}
/**
* Returns the complete file name for a text file.
*
* @param string $text_file_name
* @param SolrFieldTypeInterface $solr_field_type
*
* @return string
*/
public static function completeTextFileName(string $text_file_name, SolrFieldTypeInterface $solr_field_type) {
if ($custom_code = $solr_field_type
->getCustomCode()) {
$text_file_name .= '_' . $custom_code;
}
return $text_file_name . '_' . $solr_field_type
->getFieldTypeLanguageCode() . '.txt';
}
}
Constants
Name | Description |
---|---|
SEARCH_API_SOLR_LANGUAGE_SEPARATOR | The separator to indicate the start of a language ID. We must not use any character that has a special meaning within regular expressions. Additionally we have to avoid characters that are valid for Drupal machine names. The end of a language ID is… |