Revision 603

Date:
2016/10/24 15:31:01
Author:
ahitrov
Revision Log:
Sample sphinx.conf, search methods

Files:

Legend:

 
Added
 
Removed
 
Modified
  • utf8/plugins/sphinx/config.proto

     
    7 7
    8 8 PLUGINS += sphinx
    9 9
    10 SPHINX_HOST = localhost
    10 SPHINX_HOST = 127.0.0.1
    11 11 SPHINX_PORT = 9306
    12 SPHINX_DATA = /path/to/sphinx/database/
    13 SPHINX_TABLE = indextable
    14 SPHINX_TABLE_STEMMED = indextablestemmed
    12 15
    13 REWRITE += SPHINX_HOST SPHINX_PORT
    16 REWRITE += SPHINX_HOST SPHINX_PORT SPHINX_DATA SPHINX_TABLE SPHINX_TABLE_STEMMED
  • utf8/plugins/sphinx/etc/sphinx.conf

     
    1 #
    2 # Sphinx configuration file sample
    3 #
    4 # WARNING! While this sample file mentions all available options,
    5 # it contains (very) short helper descriptions only. Please refer to
    6 # doc/sphinx.html for details.
    7 #
    8
    9 #############################################################################
    10 ## data source definition
    11 #############################################################################
    12
    13 source zvukiru
    14 {
    15 # data source type. mandatory, no default value
    16 # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
    17 type = pgsql
    18
    19 #####################################################################
    20 ## SQL settings (for 'mysql' and 'pgsql' types)
    21 #####################################################################
    22
    23 # some straightforward parameters for SQL source types
    24 sql_host = localhost
    25 sql_user = zvuki
    26 sql_pass = sarUchOov
    27 sql_db = zvukirutf
    28 sql_port = 5432 # optional, default is 3306 for mysql and 5432 for pgsql
    29
    30 # UNIX socket name
    31 # optional, default is empty (reuse client library defaults)
    32 # usually '/var/lib/mysql/mysql.sock' on Linux
    33 # usually '/tmp/mysql.sock' on FreeBSD
    34 #
    35 # sql_sock = /tmp/mysql.sock
    36
    37
    38 # MySQL specific client connection flags
    39 # optional, default is 0
    40 #
    41 # mysql_connect_flags = 32 # enable compression
    42
    43 # MySQL specific SSL certificate settings
    44 # optional, defaults are empty
    45 #
    46 # mysql_ssl_cert = /etc/ssl/client-cert.pem
    47 # mysql_ssl_key = /etc/ssl/client-key.pem
    48 # mysql_ssl_ca = /etc/ssl/cacert.pem
    49
    50 # MS SQL specific Windows authentication mode flag
    51 # MUST be in sync with charset_type index-level setting
    52 # optional, default is 0
    53 #
    54 # mssql_winauth = 1 # use currently logged on user credentials
    55
    56
    57 # ODBC specific DSN (data source name)
    58 # mandatory for odbc source type, no default value
    59 #
    60 # odbc_dsn = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
    61 # sql_query = SELECT id, data FROM documents.csv
    62
    63
    64 # ODBC and MS SQL specific, per-column buffer sizes
    65 # optional, default is auto-detect
    66 #
    67 # sql_column_buffers = content=12M, comments=1M
    68
    69
    70 # pre-query, executed before the main fetch query
    71 # multi-value, optional, default is empty list of queries
    72 #
    73 # sql_query_pre = SET NAMES utf8
    74 # sql_query_pre = SET SESSION query_cache_type=OFF
    75
    76
    77 # main document fetch query
    78 # mandatory, integer document ID field MUST be the first selected column
    79 sql_query = \
    80 SELECT id, object_id, object_class, extract(epoch from date_trunc('seconds', mtime)) AS last_edited, is_deleted, name as title, search as content \
    81 FROM search
    82
    83
    84 # joined/payload field fetch query
    85 # joined fields let you avoid (slow) JOIN and GROUP_CONCAT
    86 # payload fields let you attach custom per-keyword values (eg. for ranking)
    87 #
    88 # syntax is FIELD-NAME 'from' ( 'query' | 'payload-query' ); QUERY
    89 # joined field QUERY should return 2 columns (docid, text)
    90 # payload field QUERY should return 3 columns (docid, keyword, weight)
    91 #
    92 # REQUIRES that query results are in ascending document ID order!
    93 # multi-value, optional, default is empty list of queries
    94 #
    95 # sql_joined_field = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC
    96 # sql_joined_field = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC
    97
    98
    99 # file based field declaration
    100 #
    101 # content of this field is treated as a file name
    102 # and the file gets loaded and indexed in place of a field
    103 #
    104 # max file size is limited by max_file_field_buffer indexer setting
    105 # file IO errors are non-fatal and get reported as warnings
    106 #
    107 # sql_file_field = content_file_path
    108
    109
    110 # range query setup, query that must return min and max ID values
    111 # optional, default is empty
    112 #
    113 # sql_query will need to reference $start and $end boundaries
    114 # if using ranged query:
    115 #
    116 # sql_query = \
    117 # SELECT doc.id, doc.id AS group, doc.title, doc.data \
    118 # FROM documents doc \
    119 # WHERE id>=$start AND id<=$end
    120 #
    121 # sql_query_range = SELECT MIN(id),MAX(id) FROM documents
    122
    123
    124 # range query step
    125 # optional, default is 1024
    126 #
    127 sql_range_step = 1000
    128
    129
    130 # unsigned integer attribute declaration
    131 # multi-value (an arbitrary number of attributes is allowed), optional
    132 # optional bit size can be specified, default is 32
    133 #
    134 # sql_attr_uint = author_id
    135 # sql_attr_uint = forum_id:9 # 9 bits for forum_id
    136 sql_attr_uint = object_id
    137
    138 # boolean attribute declaration
    139 # multi-value (an arbitrary number of attributes is allowed), optional
    140 # equivalent to sql_attr_uint with 1-bit size
    141 #
    142 sql_attr_bool = is_deleted
    143
    144
    145 # bigint attribute declaration
    146 # multi-value (an arbitrary number of attributes is allowed), optional
    147 # declares a signed (unlike uint!) 64-bit attribute
    148 #
    149 # sql_attr_bigint = my_bigint_id
    150
    151
    152 # UNIX timestamp attribute declaration
    153 # multi-value (an arbitrary number of attributes is allowed), optional
    154 # similar to integer, but can also be used in date functions
    155 #
    156 # sql_attr_timestamp = posted_ts
    157 sql_attr_timestamp = last_edited
    158 # sql_attr_timestamp = date_added
    159
    160
    161 # floating point attribute declaration
    162 # multi-value (an arbitrary number of attributes is allowed), optional
    163 # values are stored in single precision, 32-bit IEEE 754 format
    164 #
    165 # sql_attr_float = lat_radians
    166 # sql_attr_float = long_radians
    167
    168
    169 # multi-valued attribute (MVA) attribute declaration
    170 # multi-value (an arbitrary number of attributes is allowed), optional
    171 # MVA values are variable length lists of unsigned 32-bit integers
    172 #
    173 # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
    174 # ATTR-TYPE is 'uint' or 'timestamp'
    175 # SOURCE-TYPE is 'field', 'query', or 'ranged-query'
    176 # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
    177 # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
    178 #
    179 # sql_attr_multi = uint tag from query; SELECT docid, tagid FROM tags
    180 # sql_attr_multi = uint tag from ranged-query; \
    181 # SELECT docid, tagid FROM tags WHERE id>=$start AND id<=$end; \
    182 # SELECT MIN(docid), MAX(docid) FROM tags
    183
    184
    185 # string attribute declaration
    186 # multi-value (an arbitrary number of these is allowed), optional
    187 # lets you store and retrieve strings
    188 #
    189 sql_attr_string = object_class
    190
    191
    192 # JSON attribute declaration
    193 # multi-value (an arbitrary number of these is allowed), optional
    194 # lets you store a JSON document as an (in-memory) attribute for later use
    195 #
    196 # sql_attr_json = properties
    197
    198
    199 # combined field plus attribute declaration (from a single column)
    200 # stores column as an attribute, but also indexes it as a full-text field
    201 #
    202 # sql_field_string = author
    203
    204
    205 # post-query, executed on sql_query completion
    206 # optional, default is empty
    207 #
    208 # sql_query_post =
    209
    210
    211 # post-index-query, executed on successful indexing completion
    212 # optional, default is empty
    213 # $maxid expands to max document ID actually fetched from DB
    214 #
    215 # sql_query_post_index = REPLACE INTO counters ( id, val ) \
    216 # VALUES ( 'max_indexed_id', $maxid )
    217
    218
    219 # ranged query throttling, in milliseconds
    220 # optional, default is 0 which means no delay
    221 # enforces given delay before each query step
    222 sql_ranged_throttle = 0
    223
    224
    225 # kill-list query, fetches the document IDs for kill-list
    226 # k-list will suppress matches from preceding indexes in the same query
    227 # optional, default is empty
    228 #
    229 # sql_query_killlist = SELECT id FROM documents WHERE edited>=@last_reindex
    230
    231
    232 # columns to unpack on indexer side when indexing
    233 # multi-value, optional, default is empty list
    234 #
    235 # unpack_zlib = zlib_column
    236 # unpack_mysqlcompress = compressed_column
    237 # unpack_mysqlcompress = compressed_column_2
    238
    239
    240 # maximum unpacked length allowed in MySQL COMPRESS() unpacker
    241 # optional, default is 16M
    242 #
    243 # unpack_mysqlcompress_maxsize = 16M
    244
    245
    246 # hook command to run when SQL connection succeeds
    247 # optional, default value is empty (do nothing)
    248 #
    249 # hook_connect = bash sql_connect.sh
    250
    251
    252 # hook command to run after (any) SQL range query
    253 # it may print out "minid maxid" (w/o quotes) to override the range
    254 # optional, default value is empty (do nothing)
    255 #
    256 # hook_query_range = bash sql_query_range.sh
    257
    258
    259 # hook command to run on successful indexing completion
    260 # $maxid expands to max document ID actually fetched from DB
    261 # optional, default value is empty (do nothing)
    262 #
    263 # hook_post_index = bash sql_post_index.sh $maxid
    264
    265 #####################################################################
    266 ## xmlpipe2 settings
    267 #####################################################################
    268
    269 # type = xmlpipe
    270
    271 # shell command to invoke xmlpipe stream producer
    272 # mandatory
    273 #
    274 # xmlpipe_command = cat /var/db/sphinxsearch/test.xml
    275
    276 # xmlpipe2 field declaration
    277 # multi-value, optional, default is empty
    278 #
    279 # xmlpipe_field = subject
    280 # xmlpipe_field = content
    281
    282
    283 # xmlpipe2 attribute declaration
    284 # multi-value, optional, default is empty
    285 # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
    286 # examples:
    287 #
    288 # xmlpipe_attr_timestamp = published
    289 # xmlpipe_attr_uint = author_id
    290 # xmlpipe_attr_bool = is_enabled
    291 # xmlpipe_attr_float = latitude
    292 # xmlpipe_attr_bigint = guid
    293 # xmlpipe_attr_multi = tags
    294 # xmlpipe_attr_multi_64 = tags64
    295 # xmlpipe_attr_string = title
    296 # xmlpipe_attr_json = extra_data
    297 # xmlpipe_field_string = content
    298
    299
    300 # perform UTF-8 validation, and filter out incorrect codes
    301 # avoids XML parser choking on non-UTF-8 documents
    302 # optional, default is 0
    303 #
    304 # xmlpipe_fixup_utf8 = 1
    305 }
    306
    307
    308 # inherited source example
    309 #
    310 # all the parameters are copied from the parent source,
    311 # and may then be overridden in this source definition
    312 source zvukiruthrottled : zvukiru
    313 {
    314 sql_ranged_throttle = 100
    315 }
    316
    317 #############################################################################
    318 ## index definition
    319 #############################################################################
    320
    321 # local index example
    322 #
    323 # this is an index which is stored locally in the filesystem
    324 #
    325 # all indexing-time options (such as morphology and charsets)
    326 # are configured per local index
    327 index zvukiru
    328 {
    329 # index type
    330 # optional, default is 'plain'
    331 # known values are 'plain', 'distributed', and 'rt' (see samples below)
    332 # type = plain
    333
    334 # document source(s) to index
    335 # multi-value, mandatory
    336 # document IDs must be globally unique across all sources
    337 source = zvukiru
    338
    339 # index files path and file name, without extension
    340 # mandatory, path must be writable, extensions will be auto-appended
    341 path = /var/db/sphinxsearch/data/zvukiru
    342
    343 # document attribute values (docinfo) storage mode
    344 # optional, default is 'extern'
    345 # known values are 'none', 'extern' and 'inline'
    346 docinfo = extern
    347
    348 # dictionary type, 'crc' or 'keywords'
    349 # crc is faster to index when no substring/wildcards searches are needed
    350 # crc with substrings might be faster to search but is much slower to index
    351 # (because all substrings are pre-extracted as individual keywords)
    352 # keywords is much faster to index with substrings, and index is much (3-10x) smaller
    353 # keywords supports wildcards, crc does not, and never will
    354 # optional, default is 'keywords'
    355 dict = keywords
    356
    357 # memory locking for cached data (.spa and .spi), to prevent swapping
    358 # optional, default is 0 (do not mlock)
    359 # requires searchd to be run from root
    360 mlock = 0
    361
    362 # a list of morphology preprocessors to apply
    363 # optional, default is empty
    364 #
    365 # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
    366 # 'soundex', and 'metaphone'; additional preprocessors available from
    367 # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
    368 # (see libstemmer_c/libstemmer/modules.txt)
    369 #
    370 morphology = stem_en, stem_ru, soundex
    371 # morphology = libstemmer_german
    372 # morphology = libstemmer_sv
    373 # morphology = none
    374
    375 # minimum word length at which to enable stemming
    376 # optional, default is 1 (stem everything)
    377 #
    378 min_stemming_len = 2
    379
    380
    381 # stopword files list (space separated)
    382 # optional, default is empty
    383 # contents are plain text, charset_table and stemming are both applied
    384 #
    385 # stopwords = /var/db/sphinxsearch/data/stopwords.txt
    386
    387
    388 # wordforms file, in "mapfrom > mapto" plain text format
    389 # optional, default is empty
    390 #
    391 # wordforms = /var/db/sphinxsearch/data/wordforms.txt
    392
    393
    394 # tokenizing exceptions file
    395 # optional, default is empty
    396 #
    397 # plain text, case sensitive, space insensitive in map-from part
    398 # one "Map Several Words => ToASingleOne" entry per line
    399 #
    400 # exceptions = /var/db/sphinxsearch/data/exceptions.txt
    401
    402
    403 # embedded file size limit
    404 # optional, default is 16K
    405 #
    406 # exceptions, wordforms, and stopwords files smaller than this limit
    407 # are stored in the index; otherwise, their paths and sizes are stored
    408 #
    409 # embedded_limit = 16K
    410
    411 # minimum indexed word length
    412 # default is 1 (index everything)
    413 min_word_len = 2
    414
    415
    416 # ignored characters list
    417 # optional, default value is empty
    418 #
    419 # ignore_chars = U+00AD
    420
    421
    422 # minimum word prefix length to index
    423 # optional, default is 0 (do not index prefixes)
    424 #
    425 # min_prefix_len = 0
    426
    427
    428 # minimum word infix length to index
    429 # optional, default is 0 (do not index infixes)
    430 #
    431 # min_infix_len = 0
    432
    433
    434 # maximum substring (prefix or infix) length to index
    435 # optional, default is 0 (do not limit substring length)
    436 #
    437 # max_substring_len = 8
    438
    439
    440 # list of fields to limit prefix/infix indexing to
    441 # optional, default value is empty (index all fields in prefix/infix mode)
    442 #
    443 # prefix_fields = filename
    444 # infix_fields = url, domain
    445
    446
    447 # expand keywords with exact forms and/or stars when searching fit indexes
    448 # search-time only, does not affect indexing, can be 0 or 1
    449 # optional, default is 0 (do not expand keywords)
    450 #
    451 # expand_keywords = 1
    452
    453
    454 # n-gram length to index, for CJK indexing
    455 # only supports 0 and 1 for now, other lengths to be implemented
    456 # optional, default is 0 (disable n-grams)
    457 #
    458 # ngram_len = 1
    459
    460
    461 # n-gram characters list, for CJK indexing
    462 # optional, default is empty
    463 #
    464 # ngram_chars = U+3000..U+2FA1F
    465
    466
    467 # phrase boundary characters list
    468 # optional, default is empty
    469 #
    470 # phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis
    471
    472
    473 # phrase boundary word position increment
    474 # optional, default is 0
    475 #
    476 # phrase_boundary_step = 100
    477
    478
    479 # blended characters list
    480 # blended chars are indexed both as separators and valid characters
    481 # for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
    482 # optional, default is empty
    483 #
    484 # blend_chars = +, &, U+23
    485
    486
    487 # blended token indexing mode
    488 # a comma separated list of blended token indexing variants
    489 # known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
    490 # optional, default is trim_none
    491 #
    492 # blend_mode = trim_tail, skip_pure
    493
    494
    495 # whether to strip HTML tags from incoming documents
    496 # known values are 0 (do not strip) and 1 (do strip)
    497 # optional, default is 0
    498 html_strip = 0
    499
    500 # what HTML attributes to index if stripping HTML
    501 # optional, default is empty (do not index anything)
    502 #
    503 # html_index_attrs = img=alt,title; a=title;
    504
    505
    506 # what HTML elements contents to strip
    507 # optional, default is empty (do not strip element contents)
    508 #
    509 # html_remove_elements = style, script
    510
    511
    512 # whether to preopen index data files on startup
    513 # optional, default is 0 (do not preopen), searchd-only
    514 #
    515 # preopen = 1
    516
    517
    518 # whether to enable in-place inversion (2x less disk, 90-95% speed)
    519 # optional, default is 0 (use separate temporary files), indexer-only
    520 #
    521 # inplace_enable = 1
    522
    523
    524 # in-place fine-tuning options
    525 # optional, defaults are listed below
    526 #
    527 # inplace_hit_gap = 0 # preallocated hitlist gap size
    528 # inplace_docinfo_gap = 0 # preallocated docinfo gap size
    529 # inplace_reloc_factor = 0.1 # relocation buffer size within arena
    530 # inplace_write_factor = 0.1 # write buffer size within arena
    531
    532
    533 # whether to index original keywords along with stemmed versions
    534 # enables "=exactform" operator to work
    535 # optional, default is 0
    536 #
    537 # index_exact_words = 1
    538
    539
    540 # position increment on overshort (less than min_word_len) words
    541 # optional, allowed values are 0 and 1, default is 1
    542 #
    543 # overshort_step = 1
    544
    545
    546 # position increment on stopword
    547 # optional, allowed values are 0 and 1, default is 1
    548 #
    549 # stopword_step = 1
    550
    551
    552 # hitless words list
    553 # positions for these keywords will not be stored in the index
    554 # optional, allowed values are 'all', or a list file name
    555 #
    556 # hitless_words = all
    557 # hitless_words = hitless.txt
    558
    559
    560 # detect and index sentence and paragraph boundaries
    561 # required for the SENTENCE and PARAGRAPH operators to work
    562 # optional, allowed values are 0 and 1, default is 0
    563 #
    564 # index_sp = 1
    565
    566
    567 # index zones, delimited by HTML/XML tags
    568 # a comma separated list of tags and wildcards
    569 # required for the ZONE operator to work
    570 # optional, default is empty string (do not index zones)
    571 #
    572 # index_zones = title, h*, th
    573
    574
    575 # index per-document and average per-index field lengths, in tokens
    576 # required for the BM25A(), BM25F() in expression ranker
    577 # optional, default is 0 (do not index field lengths)
    578 #
    579 # index_field_lengths = 1
    580
    581
    582 # regular expressions (regexps) to filter the fields and queries with
    583 # gets applied to data source fields when indexing
    584 # gets applied to search queries when searching
    585 # multi-value, optional, default is empty list of regexps
    586 #
    587 # regexp_filter = \b(\d+)\" => \1inch
    588 # regexp_filter = (blue|red) => color
    589
    590
    591 # list of the words considered frequent with respect to bigram indexing
    592 # optional, default is empty
    593 #
    594 # bigram_freq_words = the, a, i, you, my
    595
    596
    597 # bigram indexing mode
    598 # known values are none, all, first_freq, both_freq
    599 # optional, default is none (do not index bigrams)
    600 #
    601 # bigram_index = both_freq
    602
    603
    604 # snippet document file name prefix
    605 # prepended to file names when generating snippets using load_files option
    606 # WARNING, this is a prefix (not a path), trailing slash matters!
    607 # optional, default is empty
    608 #
    609 # snippets_file_prefix = /mnt/mydocs/server1
    610
    611
    612 # whether to apply stopwords before or after stemming
    613 # optional, default is 0 (apply stopwords after stemming)
    614 #
    615 # stopwords_unstemmed = 0
    616
    617
    618 # path to a global (cluster-wide) keyword IDFs file
    619 # optional, default is empty (use local IDFs)
    620 #
    621 # global_idf = /usr/local/sphinx/var/global.idf
    622 }
    623
    624
    625 # inherited index example
    626 #
    627 # all the parameters are copied from the parent index,
    628 # and may then be overridden in this index definition
    629 index zvukirustemmed : zvukiru
    630 {
    631 path = /var/db/sphinxsearch/data/zvukirustemmed
    632 morphology = stem_en
    633 }
    634
    635
    636 # distributed index example
    637 #
    638 # this is a virtual index which can NOT be directly indexed,
    639 # and only contains references to other local and/or remote indexes
    640 index dist1
    641 {
    642 # 'distributed' index type MUST be specified
    643 type = distributed
    644
    645 # local index to be searched
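    646 # there can be many local indexes configured; NOTE(review): test1/test1stemmed are not defined in the visible part of this file (zvukiru/zvukirustemmed are) - confirm the intended names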
    647 local = test1
    648 local = test1stemmed
    649
    650 # remote agent
    651 # multiple remote agents may be specified
    652 # syntax for TCP connections is 'hostname:port:index1,[index2[,...]]'
    653 # syntax for local UNIX connections is '/path/to/socket:index1,[index2[,...]]'
    654 agent = localhost:9313:remote1
    655 agent = localhost:9314:remote2,remote3
    656 # agent = /var/run/searchd.sock:remote4
    657
    658 # remote agent mirrors groups, aka mirrors, aka HA agents
    659 # defines 2 or more interchangeable mirrors for a given index part
    660 #
    661 # agent = server3:9312 | server4:9312 :indexchunk2
    662 # agent = server3:9312:chunk2server3 | server4:9312:chunk2server4
    663 # agent = server3:chunk2server3 | server4:chunk2server4
    664 # agent = server21|server22|server23:chunk2
    665
    666
    667 # blackhole remote agent, for debugging/testing
    668 # network errors and search results will be ignored
    669 #
    670 # agent_blackhole = testbox:9312:testindex1,testindex2
    671
    672
    673 # persistently connected remote agent
    674 # reduces connect() pressure, requires workers = threads
    675 #
    676 # agent_persistent = testbox:9312:testindex1,testindex2
    677
    678
    679 # remote agent connection timeout, milliseconds
    680 # optional, default is 1000 ms, ie. 1 sec
    681 agent_connect_timeout = 1000
    682
    683 # remote agent query timeout, milliseconds
    684 # optional, default is 3000 ms, ie. 3 sec
    685 agent_query_timeout = 3000
    686
    687 # HA mirror agent strategy
    688 # optional, default is random (pick a random mirror)
    689 # known values are nodeads, noerrors, roundrobin, nodeadstm, noerrorstm
    690 #
    691 # ha_strategy = nodeads
    692
    693 # path to RLP context file
    694 # optional, default is empty
    695 #
    696 # rlp_context = /usr/local/share/sphinx/rlp/rlp-context.xml
    697 }
    698
    699
    700 # realtime index example
    701 #
    702 # you can run INSERT, REPLACE, and DELETE on this index on the fly
    703 # using MySQL protocol (see 'listen' directive below)
    704 index rt
    705 {
    706 # 'rt' index type must be specified to use RT index
    707 type = rt
    708
    709 # index files path and file name, without extension
    710 # mandatory, path must be writable, extensions will be auto-appended
    711 path = /var/db/sphinxsearch/data/rt
    712
    713 # RAM chunk size limit
    714 # RT index will keep at most this much data in RAM, then flush to disk
    715 # optional, default is 128M
    716 #
    717 # rt_mem_limit = 512M
    718
    719 # full-text field declaration
    720 # multi-value, mandatory
    721 rt_field = title
    722 rt_field = content
    723
    724 # unsigned integer attribute declaration
    725 # multi-value (an arbitrary number of attributes is allowed), optional
    726 # declares an unsigned 32-bit attribute
    727 rt_attr_uint = gid
    728
    729 # RT indexes currently support the following attribute types:
    730 # uint, bigint, float, timestamp, string, mva, mva64, json
    731 #
    732 # rt_attr_bigint = guid
    733 # rt_attr_float = gpa
    734 # rt_attr_timestamp = ts_added
    735 # rt_attr_string = author
    736 # rt_attr_multi = tags
    737 # rt_attr_multi_64 = tags64
    738 # rt_attr_json = extra_data
    739 }
    740
    741 #############################################################################
    742 ## indexer settings
    743 #############################################################################
    744
    745 indexer
    746 {
    747 # memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
    748 # optional, default is 128M, max is 2047M, recommended is 256M to 1024M
    749 mem_limit = 128M
    750
    751 # maximum IO calls per second (for I/O throttling)
    752 # optional, default is 0 (unlimited)
    753 #
    754 # max_iops = 40
    755
    756
    757 # maximum IO call size, bytes (for I/O throttling)
    758 # optional, default is 0 (unlimited)
    759 #
    760 # max_iosize = 1048576
    761
    762
    763 # maximum xmlpipe2 field length, bytes
    764 # optional, default is 2M
    765 #
    766 # max_xmlpipe2_field = 4M
    767
    768
    769 # write buffer size, bytes
    770 # several (currently up to 4) buffers will be allocated
    771 # write buffers are allocated in addition to mem_limit
    772 # optional, default is 1M
    773 #
    774 # write_buffer = 1M
    775
    776
    777 # maximum file field adaptive buffer size
    778 # optional, default is 8M, minimum is 1M
    779 #
    780 # max_file_field_buffer = 32M
    781
    782
    783 # how to handle IO errors in file fields
    784 # known values are 'ignore_field', 'skip_document', and 'fail_index'
    785 # optional, default is 'ignore_field'
    786 #
    787 # on_file_field_error = skip_document
    788
    789
    790 # lemmatizer cache size
    791 # improves the indexing time when the lemmatization is enabled
    792 # optional, default is 256K
    793 #
    794 # lemmatizer_cache = 512M
    795 }
    796
    797 #############################################################################
    798 ## searchd settings
    799 #############################################################################
    800
    801 searchd
    802 {
    803 # [hostname:]port[:protocol], or /unix/socket/path to listen on
    804 # known protocols are 'sphinx' (SphinxAPI) and 'mysql41' (SphinxQL)
    805 #
    806 # multi-value, multiple listen points are allowed
    807 # optional, defaults are 9312:sphinx and 9306:mysql41, as below
    808 #
    809 # listen = 127.0.0.1
    810 # listen = 192.168.0.1:9312
    811 # listen = 9312
    812 # listen = /var/run/searchd.sock
    813 listen = 9312
    814 listen = 9306:mysql41
    815
    816 # log file, searchd run info is logged here
    817 # optional, default is 'searchd.log'
    818 log = /var/log/sphinxsearch/searchd.log
    819
    820 # query log file, all search queries are logged here
    821 # optional, default is empty (do not log queries)
    822 query_log = /var/log/sphinxsearch/sphinx-query.log
    823
    824 # client read timeout, seconds
    825 # optional, default is 5
    826 read_timeout = 5
    827
    828 # request timeout, seconds
    829 # optional, default is 5 minutes
    830 client_timeout = 300
    831
    832 # maximum amount of children to fork (concurrent searches to run)
    833 # optional, default is 0 (unlimited)
    834 max_children = 30
    835
    836 # maximum amount of persistent connections from this master to each agent host
    837 # optional, but necessary if you use agent_persistent. It is reasonable to set the value
    838 # as max_children, or less on the agent's hosts.
    839 persistent_connections_limit = 30
    840
    841 # PID file, searchd process ID file name
    842 # mandatory
    843 pid_file = /var/run/sphinxsearch/searchd.pid
    844
    845 # seamless rotate, prevents rotate stalls if precaching huge datasets
    846 # optional, default is 1
    847 seamless_rotate = 1
    848
    849 # whether to forcibly preopen all indexes on startup
    850 # optional, default is 1 (preopen everything)
    851 preopen_indexes = 1
    852
    853 # whether to unlink .old index copies on successful rotation.
    854 # optional, default is 1 (do unlink)
    855 unlink_old = 1
    856
    857 # attribute updates periodic flush timeout, seconds
    858 # updates will be automatically dumped to disk this frequently
    859 # optional, default is 0 (disable periodic flush)
    860 #
    861 # attr_flush_period = 900
    862
    863
    864 # MVA updates pool size
    865 # shared between all instances of searchd, disables attr flushes!
    866 # optional, default size is 1M
    867 mva_updates_pool = 1M
    868
    869 # max allowed network packet size
    870 # limits both query packets from clients, and responses from agents
    871 # optional, default size is 8M
    872 max_packet_size = 8M
    873
    874 # max allowed per-query filter count
    875 # optional, default is 256
    876 max_filters = 256
    877
    878 # max allowed per-filter values count
    879 # optional, default is 4096
    880 max_filter_values = 4096
    881
    882
    883 # socket listen queue length
    884 # optional, default is 5
    885 #
    886 # listen_backlog = 5
    887
    888
    889 # per-keyword read buffer size
    890 # optional, default is 256K
    891 #
    892 # read_buffer = 256K
    893
    894
    895 # unhinted read size (currently used when reading hits)
    896 # optional, default is 32K
    897 #
    898 # read_unhinted = 32K
    899
    900
    901 # max allowed per-batch query count (aka multi-query count)
    902 # optional, default is 32
    903 max_batch_queries = 32
    904
    905
    906 # max common subtree document cache size, per-query
    907 # optional, default is 0 (disable subtree optimization)
    908 #
    909 # subtree_docs_cache = 4M
    910
    911
    912 # max common subtree hit cache size, per-query
    913 # optional, default is 0 (disable subtree optimization)
    914 #
    915 # subtree_hits_cache = 8M
    916
    917
    918 # multi-processing mode (MPM)
    919 # known values are none, fork, prefork, and threads
    920 # threads is required for RT backend to work
    921 # optional, default is threads
    922 workers = threads # for RT to work
    923
    924
    925 # max threads to create for searching local parts of a distributed index
    926 # optional, default is 0, which means disable multi-threaded searching
    927 # should work with all MPMs (ie. does NOT require workers=threads)
    928 #
    929 # dist_threads = 4
    930
    931
    932 # binlog files path; use empty string to disable binlog
    933 # optional, default is build-time configured data directory
    934 #
    935 # binlog_path = # disable logging
    936 # binlog_path = /var/db/sphinxsearch/data # binlog.001 etc will be created there
    937
    938
    939 # binlog flush/sync mode
    940 # 0 means flush and sync every second
    941 # 1 means flush and sync every transaction
    942 # 2 means flush every transaction, sync every second
    943 # optional, default is 2
    944 #
    945 # binlog_flush = 2
    946
    947
    948 # binlog per-file size limit
    949 # optional, default is 128M, 0 means no limit
    950 #
    951 # binlog_max_log_size = 256M
    952
    953
    954 # per-thread stack size, only affects workers=threads mode
    955 # optional, default is 64K
    956 #
    957 # thread_stack = 128K
    958
    959
    960 # per-keyword expansion limit (for dict=keywords prefix searches)
    961 # optional, default is 0 (no limit)
    962 #
    963 # expansion_limit = 1000
    964
    965
    966 # RT RAM chunks flush period
    967 # optional, default is 0 (no periodic flush)
    968 #
    969 # rt_flush_period = 900
    970
    971
    972 # query log file format
    973 # optional, known values are plain and sphinxql, default is plain
    974 #
    975 # query_log_format = sphinxql
    976
    977
    978 # version string returned to MySQL network protocol clients
    979 # optional, default is empty (use Sphinx version)
    980 #
    981 # mysql_version_string = 5.0.37
    982
    983
    984 # default server-wide collation
    985 # optional, default is libc_ci
    986 #
    987 # collation_server = utf8_general_ci
    988
    989
    990 # server-wide locale for libc based collations
    991 # optional, default is C
    992 #
    993 # collation_libc_locale = ru_RU.UTF-8
    994
    995
    996 # threaded server watchdog (only used in workers=threads mode)
    997 # optional, values are 0 and 1, default is 1 (watchdog on)
    998 #
    999 # watchdog = 1
    1000
    1001
    1002 # costs for max_predicted_time model, in (imaginary) nanoseconds
    1003 # optional, default is "doc=64, hit=48, skip=2048, match=64"
    1004 #
    1005 # predicted_time_costs = doc=64, hit=48, skip=2048, match=64
    1006
    1007
    1008 # current SphinxQL state (uservars etc) serialization path
    1009 # optional, default is none (do not serialize SphinxQL state)
    1010 #
    1011 # sphinxql_state = sphinxvars.sql
    1012
    1013
    1014 # maximum RT merge thread IO calls per second, and per-call IO size
    1015 # useful for throttling (the background) OPTIMIZE INDEX impact
    1016 # optional, default is 0 (unlimited)
    1017 #
    1018 # rt_merge_iops = 40
    1019 # rt_merge_maxiosize = 1M
    1020
    1021
    1022 # interval between agent mirror pings, in milliseconds
    1023 # 0 means disable pings
    1024 # optional, default is 1000
    1025 #
    1026 # ha_ping_interval = 0
    1027
    1028
    1029 # agent mirror statistics window size, in seconds
    1030 # stats older than the window size (karma) are retired
    1031 # that is, they will not affect master choice of agents in any way
    1032 # optional, default is 60 seconds
    1033 #
    1034 # ha_period_karma = 60
    1035
    1036
    1037 # delay between preforked children restarts on rotation, in milliseconds
    1038 # optional, default is 0 (no delay)
    1039 #
    1040 # prefork_rotation_throttle = 100
    1041
    1042
    1043 # a prefix to prepend to the local file names when creating snippets
    1044 # with load_files and/or load_files_scatter options
    1045 # optional, default is empty
    1046 #
    1047 # snippets_file_prefix = /mnt/common/server1/
    1048 }
    1049
    1050 #############################################################################
    1051 ## common settings
    1052 #############################################################################
    1053
    1054 common
    1055 {
    1056
    1057 # lemmatizer dictionaries base path
    1058 # optional, default is /usr/local/share (see ./configure --datadir)
    1059 #
    1060 # lemmatizer_base = /usr/local/share/sphinx/dicts
    1061
    1062
    1063 # how to handle syntax errors in JSON attributes
    1064 # known values are 'ignore_attr' and 'fail_index'
    1065 # optional, default is 'ignore_attr'
    1066 #
    1067 # on_json_attr_error = fail_index
    1068
    1069
    1070 # whether to auto-convert numeric values from strings in JSON attributes
    1071 # with auto-conversion, string value with actually numeric data
    1072 # (as in {"key":"12345"}) gets stored as a number, rather than string
    1073 # optional, allowed values are 0 and 1, default is 0 (do not convert)
    1074 #
    1075 # json_autoconv_numbers = 1
    1076
    1077
    1078 # whether and how to auto-convert key names in JSON attributes
    1079 # known value is 'lowercase'
    1080 # optional, default is unspecified (do nothing)
    1081 #
    1082 # json_autoconv_keynames = lowercase
    1083
    1084
    1085 # path to RLP root directory
    1086 # optional, default is /usr/local/share (see ./configure --datadir)
    1087 #
    1088 # rlp_root = /usr/local/share/sphinx/rlp
    1089
    1090
    1091 # path to RLP environment file
    1092 # optional, default is /usr/local/share/rlp-environment.xml (see ./configure --datadir)
    1093 #
    1094 # rlp_environment = /usr/local/share/sphinx/rlp/rlp/etc/rlp-environment.xml
    1095
    1096
    1097 # maximum total size of documents batched before processing them by the RLP
    1098 # optional, default is 51200
    1099 #
    1100 # rlp_max_batch_size = 100k
    1101
    1102
    1103 # maximum number of documents batched before processing them by the RLP
    1104 # optional, default is 50
    1105 #
    1106 # rlp_max_batch_docs = 100
    1107
    1108
    1109 # trusted plugin directory
    1110 # optional, default is empty (disable UDFs)
    1111 #
    1112 # plugin_dir = /usr/local/sphinx/lib
    1113
    1114 }
    1115
    1116 # --eof--
  • utf8/plugins/sphinx/lib/sphinx/Keeper.pm

     
    4 4 use warnings 'all';
    5 5 use base qw(Contenido::Keeper);
    6 6 use Contenido::Globals;
    7 use Data::Dumper;
    7 8
    8 9 ######################
    9 10 # Отправить объект в поиск:
     
    26 27 my $doc = shift;
    27 28 return undef unless ref $doc && $doc->id;
    28 29
    29 my ($object) = $self->get_documents(
    30 my ($object) = $keeper->get_documents(
    30 31 class => 'sphinx::Search',
    31 32 object_id => $doc->id,
    32 33 object_class => $doc->class,
     
    35 36 my $data = $doc->get_search_data;
    36 37 return undef unless $data;
    37 38 unless ( ref $object ) {
    38 $object = sphinx::Search->new( $self );
    39 $object = sphinx::Search->new( $keeper );
    39 40 $object->status( 1 );
    40 41 $object->is_deleted( 0 );
    41 42 $object->object_id( $doc->id );
     
    62 63 }
    63 64
    64 65
    66 # Методы поиска
    67 ####################################################################
    # Run a full-text MATCH() query against a Sphinx index table through the
    # handle returned by $self->SQL.
    #
    # Arguments: the query text, followed by key/value options.
    #   db_table     - table to query; defaults to $self->state->table_name
    #   count        - when true, select count(*) and return that single value
    #   limit        - row limit; must consist of digits only
    #   no_limit     - when true, the default limit of 1000 is not applied
    #   offset       - row offset; must consist of digits only
    #   return_value - removed from the options (default 'array_ref'); not used below
    #   hash_by      - removed from the options (default 'object_id'); not used below
    # Every remaining option is turned into a WHERE filter: an array reference
    # becomes "key in (?, ...)", anything else becomes "key = ?".
    #
    # Returns: nothing when the text is false or limit/offset fail validation;
    # the first column of the first row when count is set; otherwise an array
    # reference of row hash references.
    #
    # NOTE(review): $db_table and the filter keys are interpolated into the SQL
    # text as-is; only the values are bound as placeholders. Callers presumably
    # pass trusted option names - confirm.
    68 sub search {
    69 my $self = shift;
    70 my $text = shift;
    71 return unless $text;
    72 my (%opts) = @_;
    73
    74 my $result;
    75 my $db_table = delete $opts{db_table} || $self->state->table_name;
    # The MATCH() clause always comes first; its argument is the first bound value.
    76 my @wheres = ("MATCH(?)");
    77 my @values = ($text);
    78 my $count = delete $opts{count};
    79 my $limit = delete $opts{limit};
    # Reject anything that is not a plain string of digits (a leading minus sign is a non-digit).
    80 return if $limit && ($limit =~ /\D/ || $limit < 0);
    81 my $no_limit = delete $opts{no_limit};
    82 unless ( $no_limit ) {
    83 $limit ||= 1000;
    84 }
    85 my $offset = delete $opts{offset};
    86 return if $offset && ($offset =~ /\D/ || $offset < 0);
    # NOTE(review): the next two values are fetched but never read later in this sub.
    87 my $return_value = delete $opts{return_value} || 'array_ref';
    88 my $hash_by = delete $opts{hash_by} || 'object_id';
    89
    # Whatever is left in %opts is treated as an attribute filter.
    90 while ( my ($key, $val) = each %opts ) {
    91 if ( ref $val eq 'ARRAY' ) {
    92 push @wheres, "$key in (".join(',', map { '?' } @$val).")";
    93 push @values, @$val;
    94 } else {
    95 push @wheres, "$key = ?";
    96 push @values, $val;
    97 }
    98 }
    99 my $query = "select ".($count ? 'count(*) as cnt' : '*, weight() as weight')." from $db_table where ".join( ' and ', @wheres );
    # limit/offset are appended as literals; they were validated as digit-only above.
    100 if ( $limit ) {
    101 $query .= " limit $limit ";
    102 }
    103 if ( $offset ) {
    104 $query .= " offset $offset ";
    105 }
    106 warn "SEARCH QUERY: $query\n" if $DEBUG;
    107 warn "SEARCH VALUES: ".Dumper( \@values ) if $DEBUG;
    108 my $sth = $self->SQL->prepare_cached( $query );
    109 $sth->execute( @values );
    110 if ( $count ) {
    # Count mode: unwrap the single value from the first row.
    111 $result = $sth->fetchrow_arrayref;
    112 $result = $result->[0];
    113 } else {
    # Row mode: collect every row as a hash reference.
    114 $result = [];
    115 while ( my $row = $sth->fetchrow_hashref ) {
    116 push @$result, $row;
    117 }
    118 }
    119 return $result;
    120 }
    121
    # Same as search(), but against the table named by $self->state->table_name_stemmed.
    # The db_table pair is appended after the caller's arguments, so it overrides
    # any db_table the caller passed (the later key wins when search() builds %opts).
    122 sub stemmed {
    123 my $self = shift;
    124 my $db_table = $self->state->table_name_stemmed;
    125 return $self->search( @_, db_table => $db_table );
    126 }
    127
    128 # МЕТОДЫ ДОСТУПА К СОЕДИНЕНИЯМ С БАЗОЙ УМНЫЕ
    129 ####################################################################
    130 # Get the database connection, establishing a new one if there was none.
    # Returns $self->{SQL} when connect_check() succeeds, undef otherwise.
    131 sub SQL {
    132 my $self = shift;
    133 return ($self->connect_check() ? $self->{SQL} : undef);
    134 }
    135
    136 # -------------------------------------------------------------------------------------------------
    137 # Open the database connection
    138 # -------------------------------------------------------------------------------------------------
    # Does nothing when is_connected() already reports a live handle. Otherwise
    # stores the handle from db_connect() in $self->{SQL}, dying if none was
    # returned, and issues SET NAMES when $self->state->db_client_encoding is set.
    # Marks the connection as good in $self->{_connect_ok} and returns 1.
    139 sub connect {
    140 my $self = shift;
    141 # the connection already exists
    142 if ($self->is_connected) {
    143 } else {
    144 unless ($self->{SQL} = $self->db_connect) {
    145 warn "Не могу соединиться с базой данных";
    146 die;
    147 }
    148 $self->{SQL}->do("SET NAMES '".$self->state->db_client_encoding."'") if ($self->state->db_client_encoding);
    149 }
    150
    151 $self->{_connect_ok} = 1;
    152 return 1;
    153 }
    154
    155 # Connection check that caches the connection state.
    # Returns 1 straight away when $self->{_connect_ok} is already set (no ping is
    # made in that case). Otherwise asks is_connected(), and falls back to
    # connect() when the handle is not live. Returns 0 only if connect() returns false.
    156 sub connect_check {
    157 my $self = shift;
    158 return 1 if ($self->{_connect_ok});
    159 if ($self->is_connected) {
    160 $self->{_connect_ok} = 1;
    161 return 1;
    162 } else {
    163 if ($self->connect) {
    164 return 1;
    165 } else {
    166 # logically this branch should be unreachable, because connect() is expected to die
    167 warn "Connect failed\n";
    168 return 0;
    169 }
    170 }
    171 }
    172
    # Open a new DBI handle using the DBI:mysql driver, with a DSN built from
    # $self->{db_host} and $self->{db_port} plus mysql_enable_utf8=1. No user
    # name or password is passed. Dies if DBI->connect returns false; otherwise
    # returns the handle.
    #
    # NOTE(review): host and port are read from the keeper object itself, not
    # from $self->state - presumably the base class copies them there; confirm.
    173 sub db_connect {
    174 my $self = shift;
    175 my $dbh = DBI->connect('DBI:mysql:host='.$self->{db_host}.';port='.$self->{db_port}.';mysql_enable_utf8=1')
    176 || die "Contenido Error: Не могу соединиться с Sphinx базой данных\n";
    177
    178 # $dbh->{'AutoCommit'} = 1;
    179 # $dbh->{mysql_auto_reconnect} = 1;
    180
    181 return $dbh;
    182 }
    183
    # Report whether $self->{SQL} is a live handle: it must be a reference,
    # support ping(), and answer the ping. The result is also stored in
    # $self->{_connect_ok} (1 or 0), which connect_check() uses as its cache.
    184 sub is_connected {
    185 my $self = shift;
    186 if ( ref $self->{SQL} and $self->{SQL}->can('ping') and $self->{SQL}->ping() ) {
    187 $self->{_connect_ok} = 1;
    188 return 1;
    189 } else {
    190 $self->{_connect_ok} = 0;
    191 return 0;
    192 }
    193
    # Both branches above return; the lines below are an older, disabled check.
    194 # warn 'Check if MySQL DB connected: '.(ref $self && exists $self->{SQL} && ref $self->{SQL} ? 1 : 0 ) if $DEBUG;
    195 # return ( ref($self) && exists $self->{SQL} && ref $self->{SQL} );
    196 }
    65 197 1;
  • utf8/plugins/sphinx/lib/sphinx/State.pm.proto

     
    12 12 bless $self, $class;
    13 13
    14 14 # configured
    15 $self->{debug} = (lc('') eq 'yes');
    16 $self->{project} = '';
    17 $self->{contenido_notab} = 0;
    15 $self->{debug} = (lc('@DEBUG@') eq 'yes');
    16 $self->{project} = '@PROJECT@';
    17 $self->{contenido_notab} = 1;
    18 18 $self->{tab_name} = 'sphinx';
    19 19
    20 20 # зашитая конфигурация плагина
    21 $self->{db_type} = 'none'; ### For REAL database use 'remote'
    22 $self->{db_keepalive} = 0;
    23 $self->{db_host} = '';
    21 $self->{db_type} = 'remote'; ### For REAL database use 'remote'
    22 $self->{db_keepalive} = 0;
    23 $self->{db_host} = '@SPHINX_HOST@';
    24 24 $self->{db_name} = '';
    25 25 $self->{db_user} = '';
    26 $self->{db_password} = '';
    27 $self->{db_port} = '';
    26 $self->{db_password} = '';
    27 $self->{db_port} = '@SPHINX_PORT@';
    28 28 $self->{store_method} = 'toast';
    29 29 $self->{cascade} = 1;
    30 30 $self->{db_prepare} = 0;
    31 31
    32 $self->{memcached_enable} = lc( '' ) eq 'yes' ? 1 : 0;
    32 $self->{memcached_enable} = lc( '@MEMCACHED_ENABLE@' ) eq 'yes' ? 1 : 0;
    33 33 $self->{memcached_enable_compress} = 1;
    34 $self->{memcached_backend} = '';
    35 $self->{memcached_servers} = [qw()];
    34 $self->{memcached_backend} = '@MEMCACHED_BACKEND@';
    35 $self->{memcached_servers} = [qw(@MEMCACHED_SERVERS@)];
    36 36 $self->{memcached_busy_lock} = 60;
    37 $self->{memcached_delayed} = lc('') eq 'yes' ? 1 : 0;
    37 $self->{memcached_delayed} = lc('@MEMCACHED_DELAYED@') eq 'yes' ? 1 : 0;
    38 38
    39 39 $self->{serialize_with} = 'json'; ### or 'dumper'
    40 40
     
    44 44 $self->{images_directory} = '/nonexistent';
    45 45 $self->{preview} = '0';
    46 46
    47 $self->{table_name} = '@SPHINX_TABLE@';
    48 $self->{table_name_stemmed} = '@SPHINX_TABLE_STEMMED@';
    49
    47 50 $self->_init_();
    48 51 $self;
    49 52 }
     
    90 93 data_directory
    91 94 images_directory
    92 95 preview
    96
    97 table_name
    98 table_name_stemmed
    93 99 );
    94 100 }
    95 101

Небольшая справка по веткам

cnddist – контейнер, в котором хранятся все дистрибутивы всех библиотек и программных пакетов, которые использовались при построении различных версий Contenido. Если какой-то библиотеки в данном хранилище нет, инсталлятор сделает попытку "подтянуть" ее с веба (например, с CPAN). Если библиотека слишком старая, есть очень большая вероятность, что ее там уже нет. Поэтому мы храним весь хлам от всех сборок. Если какой-то дистрибутив вдруг отсутствует в cnddist - напишите нам, мы положим его туда.

koi8 – отмирающая ветка, чей код, выдача и все внутренние библиотеки заточены на кодировку KOI8-R. Вносятся только те дополнения, которые касаются внешнего вида и функционала админки, баги ядра, обязательные обновления портов и мелочи, которые легко скопипастить. В дальнейшем планируется полная остановка поддержки по данной ветке.

utf8 – актуальная ветка, заточенная под UTF-8.

Внутри каждой ветки: core – исходники ядра; install – скрипт установки инсталляции; plugins – плагины; samples – "готовые к употреблению" проекты, которые можно поставить, запустить и посмотреть, как они работают.