Revision 603
Date:
2016/10/24 15:31:01
Author:
ahitrov
Revision Log:
Sample sphinx.conf, search methods
Files:
Legend:
Added
Removed
Modified
utf8/plugins/sphinx/config.proto
7
7
8
8
PLUGINS += sphinx
9
9
10
SPHINX_HOST = localhost
10
SPHINX_HOST = 127.0.0.1
11
11
SPHINX_PORT = 9306
12
SPHINX_DATA = /path/to/sphinx/database/
13
SPHINX_TABLE = indextable
14
SPHINX_TABLE_STEMMED = indextablestemmed
12
15
13
REWRITE += SPHINX_HOST SPHINX_PORT
16
REWRITE += SPHINX_HOST SPHINX_PORT SPHINX_DATA SPHINX_TABLE SPHINX_TABLE_STEMMED
utf8/plugins/sphinx/etc/sphinx.conf
1
#
2
# Sphinx configuration file sample
3
#
4
# WARNING! While this sample file mentions all available options,
5
# it contains (very) short helper descriptions only. Please refer to
6
# doc/sphinx.html for details.
7
#
8
9
#############################################################################
10
## data source definition
11
#############################################################################
12
13
source zvukiru
14
{
15
# data source type. mandatory, no default value
16
# known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
17
type = pgsql
18
19
#####################################################################
20
## SQL settings (for 'mysql' and 'pgsql' types)
21
#####################################################################
22
23
# some straightforward parameters for SQL source types
24
sql_host = localhost
25
sql_user = zvuki
26
sql_pass = sarUchOov
27
sql_db = zvukirutf
28
sql_port = 5432 # optional, default is 3306
29
30
# UNIX socket name
31
# optional, default is empty (reuse client library defaults)
32
# usually '/var/lib/mysql/mysql.sock' on Linux
33
# usually '/tmp/mysql.sock' on FreeBSD
34
#
35
# sql_sock = /tmp/mysql.sock
36
37
38
# MySQL specific client connection flags
39
# optional, default is 0
40
#
41
# mysql_connect_flags = 32 # enable compression
42
43
# MySQL specific SSL certificate settings
44
# optional, defaults are empty
45
#
46
# mysql_ssl_cert = /etc/ssl/client-cert.pem
47
# mysql_ssl_key = /etc/ssl/client-key.pem
48
# mysql_ssl_ca = /etc/ssl/cacert.pem
49
50
# MS SQL specific Windows authentication mode flag
51
# MUST be in sync with charset_type index-level setting
52
# optional, default is 0
53
#
54
# mssql_winauth = 1 # use currently logged on user credentials
55
56
57
# ODBC specific DSN (data source name)
58
# mandatory for odbc source type, no default value
59
#
60
# odbc_dsn = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
61
# sql_query = SELECT id, data FROM documents.csv
62
63
64
# ODBC and MS SQL specific, per-column buffer sizes
65
# optional, default is auto-detect
66
#
67
# sql_column_buffers = content=12M, comments=1M
68
69
70
# pre-query, executed before the main fetch query
71
# multi-value, optional, default is empty list of queries
72
#
73
# sql_query_pre = SET NAMES utf8
74
# sql_query_pre = SET SESSION query_cache_type=OFF
75
76
77
# main document fetch query
78
# mandatory, integer document ID field MUST be the first selected column
79
sql_query = \
80
SELECT id, object_id, object_class, extract(epoch from date_trunc('seconds', mtime)) AS last_edited, is_deleted, name as title, search as content \
81
FROM search
82
83
84
# joined/payload field fetch query
85
# joined fields let you avoid (slow) JOIN and GROUP_CONCAT
86
# payload fields let you attach custom per-keyword values (eg. for ranking)
87
#
88
# syntax is FIELD-NAME 'from' ( 'query' | 'payload-query' ); QUERY
89
# joined field QUERY should return 2 columns (docid, text)
90
# payload field QUERY should return 3 columns (docid, keyword, weight)
91
#
92
# REQUIRES that query results are in ascending document ID order!
93
# multi-value, optional, default is empty list of queries
94
#
95
# sql_joined_field = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC
96
# sql_joined_field = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC
97
98
99
# file based field declaration
100
#
101
# content of this field is treated as a file name
102
# and the file gets loaded and indexed in place of a field
103
#
104
# max file size is limited by max_file_field_buffer indexer setting
105
# file IO errors are non-fatal and get reported as warnings
106
#
107
# sql_file_field = content_file_path
108
109
110
# range query setup, query that must return min and max ID values
111
# optional, default is empty
112
#
113
# sql_query will need to reference $start and $end boundaries
114
# if using ranged query:
115
#
116
# sql_query = \
117
# SELECT doc.id, doc.id AS group, doc.title, doc.data \
118
# FROM documents doc \
119
# WHERE id>=$start AND id<=$end
120
#
121
# sql_query_range = SELECT MIN(id),MAX(id) FROM documents
122
123
124
# range query step
125
# optional, default is 1024
126
#
127
sql_range_step = 1000
128
129
130
# unsigned integer attribute declaration
131
# multi-value (an arbitrary number of attributes is allowed), optional
132
# optional bit size can be specified, default is 32
133
#
134
# sql_attr_uint = author_id
135
# sql_attr_uint = forum_id:9 # 9 bits for forum_id
136
sql_attr_uint = object_id
137
138
# boolean attribute declaration
139
# multi-value (an arbitrary number of attributes is allowed), optional
140
# equivalent to sql_attr_uint with 1-bit size
141
#
142
sql_attr_bool = is_deleted
143
144
145
# bigint attribute declaration
146
# multi-value (an arbitrary number of attributes is allowed), optional
147
# declares a signed (unlike uint!) 64-bit attribute
148
#
149
# sql_attr_bigint = my_bigint_id
150
151
152
# UNIX timestamp attribute declaration
153
# multi-value (an arbitrary number of attributes is allowed), optional
154
# similar to integer, but can also be used in date functions
155
#
156
# sql_attr_timestamp = posted_ts
157
sql_attr_timestamp = last_edited
158
# sql_attr_timestamp = date_added
159
160
161
# floating point attribute declaration
162
# multi-value (an arbitrary number of attributes is allowed), optional
163
# values are stored in single precision, 32-bit IEEE 754 format
164
#
165
# sql_attr_float = lat_radians
166
# sql_attr_float = long_radians
167
168
169
# multi-valued attribute (MVA) attribute declaration
170
# multi-value (an arbitrary number of attributes is allowed), optional
171
# MVA values are variable length lists of unsigned 32-bit integers
172
#
173
# syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
174
# ATTR-TYPE is 'uint' or 'timestamp'
175
# SOURCE-TYPE is 'field', 'query', or 'ranged-query'
176
# QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
177
# RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
178
#
179
# sql_attr_multi = uint tag from query; SELECT docid, tagid FROM tags
180
# sql_attr_multi = uint tag from ranged-query; \
181
# SELECT docid, tagid FROM tags WHERE id>=$start AND id<=$end; \
182
# SELECT MIN(docid), MAX(docid) FROM tags
183
184
185
# string attribute declaration
186
# multi-value (an arbitrary number of these is allowed), optional
187
# lets you store and retrieve strings
188
#
189
sql_attr_string = object_class
190
191
192
# JSON attribute declaration
193
# multi-value (an arbitrary number of these is allowed), optional
194
# lets you store a JSON document as an (in-memory) attribute for later use
195
#
196
# sql_attr_json = properties
197
198
199
# combined field plus attribute declaration (from a single column)
200
# stores column as an attribute, but also indexes it as a full-text field
201
#
202
# sql_field_string = author
203
204
205
# post-query, executed on sql_query completion
206
# optional, default is empty
207
#
208
# sql_query_post =
209
210
211
# post-index-query, executed on successful indexing completion
212
# optional, default is empty
213
# $maxid expands to max document ID actually fetched from DB
214
#
215
# sql_query_post_index = REPLACE INTO counters ( id, val ) \
216
# VALUES ( 'max_indexed_id', $maxid )
217
218
219
# ranged query throttling, in milliseconds
220
# optional, default is 0 which means no delay
221
# enforces given delay before each query step
222
sql_ranged_throttle = 0
223
224
225
# kill-list query, fetches the document IDs for kill-list
226
# k-list will suppress matches from preceding indexes in the same query
227
# optional, default is empty
228
#
229
# sql_query_killlist = SELECT id FROM documents WHERE edited>=@last_reindex
230
231
232
# columns to unpack on indexer side when indexing
233
# multi-value, optional, default is empty list
234
#
235
# unpack_zlib = zlib_column
236
# unpack_mysqlcompress = compressed_column
237
# unpack_mysqlcompress = compressed_column_2
238
239
240
# maximum unpacked length allowed in MySQL COMPRESS() unpacker
241
# optional, default is 16M
242
#
243
# unpack_mysqlcompress_maxsize = 16M
244
245
246
# hook command to run when SQL connection succeeds
247
# optional, default value is empty (do nothing)
248
#
249
# hook_connect = bash sql_connect.sh
250
251
252
# hook command to run after (any) SQL range query
253
# it may print out "minid maxid" (w/o quotes) to override the range
254
# optional, default value is empty (do nothing)
255
#
256
# hook_query_range = bash sql_query_range.sh
257
258
259
# hook command to run on successful indexing completion
260
# $maxid expands to max document ID actually fetched from DB
261
# optional, default value is empty (do nothing)
262
#
263
# hook_post_index = bash sql_post_index.sh $maxid
264
265
#####################################################################
266
## xmlpipe2 settings
267
#####################################################################
268
269
# type = xmlpipe
270
271
# shell command to invoke xmlpipe stream producer
272
# mandatory
273
#
274
# xmlpipe_command = cat /var/db/sphinxsearch/test.xml
275
276
# xmlpipe2 field declaration
277
# multi-value, optional, default is empty
278
#
279
# xmlpipe_field = subject
280
# xmlpipe_field = content
281
282
283
# xmlpipe2 attribute declaration
284
# multi-value, optional, default is empty
285
# all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
286
# examples:
287
#
288
# xmlpipe_attr_timestamp = published
289
# xmlpipe_attr_uint = author_id
290
# xmlpipe_attr_bool = is_enabled
291
# xmlpipe_attr_float = latitude
292
# xmlpipe_attr_bigint = guid
293
# xmlpipe_attr_multi = tags
294
# xmlpipe_attr_multi_64 = tags64
295
# xmlpipe_attr_string = title
296
# xmlpipe_attr_json = extra_data
297
# xmlpipe_field_string = content
298
299
300
# perform UTF-8 validation, and filter out incorrect codes
301
# avoids XML parser choking on non-UTF-8 documents
302
# optional, default is 0
303
#
304
# xmlpipe_fixup_utf8 = 1
305
}
306
307
308
# inherited source example
309
#
310
# all the parameters are copied from the parent source,
311
# and may then be overridden in this source definition
312
source zvukiruthrottled : zvukiru
313
{
314
sql_ranged_throttle = 100
315
}
316
317
#############################################################################
318
## index definition
319
#############################################################################
320
321
# local index example
322
#
323
# this is an index which is stored locally in the filesystem
324
#
325
# all indexing-time options (such as morphology and charsets)
326
# are configured per local index
327
index zvukiru
328
{
329
# index type
330
# optional, default is 'plain'
331
# known values are 'plain', 'distributed', and 'rt' (see samples below)
332
# type = plain
333
334
# document source(s) to index
335
# multi-value, mandatory
336
# document IDs must be globally unique across all sources
337
source = zvukiru
338
339
# index files path and file name, without extension
340
# mandatory, path must be writable, extensions will be auto-appended
341
path = /var/db/sphinxsearch/data/zvukiru
342
343
# document attribute values (docinfo) storage mode
344
# optional, default is 'extern'
345
# known values are 'none', 'extern' and 'inline'
346
docinfo = extern
347
348
# dictionary type, 'crc' or 'keywords'
349
# crc is faster to index when no substring/wildcards searches are needed
350
# crc with substrings might be faster to search but is much slower to index
351
# (because all substrings are pre-extracted as individual keywords)
352
# keywords is much faster to index with substrings, and index is much (3-10x) smaller
353
# keywords supports wildcards, crc does not, and never will
354
# optional, default is 'keywords'
355
dict = keywords
356
357
# memory locking for cached data (.spa and .spi), to prevent swapping
358
# optional, default is 0 (do not mlock)
359
# requires searchd to be run from root
360
mlock = 0
361
362
# a list of morphology preprocessors to apply
363
# optional, default is empty
364
#
365
# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
366
# 'soundex', and 'metaphone'; additional preprocessors available from
367
# libstemmer are 'libstemmer_XXX', where XXX is algorithm code
368
# (see libstemmer_c/libstemmer/modules.txt)
369
#
370
morphology = stem_en, stem_ru, soundex
371
# morphology = libstemmer_german
372
# morphology = libstemmer_sv
373
# morphology = none
374
375
# minimum word length at which to enable stemming
376
# optional, default is 1 (stem everything)
377
#
378
min_stemming_len = 2
379
380
381
# stopword files list (space separated)
382
# optional, default is empty
383
# contents are plain text, charset_table and stemming are both applied
384
#
385
# stopwords = /var/db/sphinxsearch/data/stopwords.txt
386
387
388
# wordforms file, in "mapfrom > mapto" plain text format
389
# optional, default is empty
390
#
391
# wordforms = /var/db/sphinxsearch/data/wordforms.txt
392
393
394
# tokenizing exceptions file
395
# optional, default is empty
396
#
397
# plain text, case sensitive, space insensitive in map-from part
398
# one "Map Several Words => ToASingleOne" entry per line
399
#
400
# exceptions = /var/db/sphinxsearch/data/exceptions.txt
401
402
403
# embedded file size limit
404
# optional, default is 16K
405
#
406
# exceptions, wordforms, and stopwords files smaller than this limit
407
# are stored in the index; otherwise, their paths and sizes are stored
408
#
409
# embedded_limit = 16K
410
411
# minimum indexed word length
412
# default is 1 (index everything)
413
min_word_len = 2
414
415
416
# ignored characters list
417
# optional, default value is empty
418
#
419
# ignore_chars = U+00AD
420
421
422
# minimum word prefix length to index
423
# optional, default is 0 (do not index prefixes)
424
#
425
# min_prefix_len = 0
426
427
428
# minimum word infix length to index
429
# optional, default is 0 (do not index infixes)
430
#
431
# min_infix_len = 0
432
433
434
# maximum substring (prefix or infix) length to index
435
# optional, default is 0 (do not limit substring length)
436
#
437
# max_substring_len = 8
438
439
440
# list of fields to limit prefix/infix indexing to
441
# optional, default value is empty (index all fields in prefix/infix mode)
442
#
443
# prefix_fields = filename
444
# infix_fields = url, domain
445
446
447
# expand keywords with exact forms and/or stars when searching fit indexes
448
# search-time only, does not affect indexing, can be 0 or 1
449
# optional, default is 0 (do not expand keywords)
450
#
451
# expand_keywords = 1
452
453
454
# n-gram length to index, for CJK indexing
455
# only supports 0 and 1 for now, other lengths to be implemented
456
# optional, default is 0 (disable n-grams)
457
#
458
# ngram_len = 1
459
460
461
# n-gram characters list, for CJK indexing
462
# optional, default is empty
463
#
464
# ngram_chars = U+3000..U+2FA1F
465
466
467
# phrase boundary characters list
468
# optional, default is empty
469
#
470
# phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis
471
472
473
# phrase boundary word position increment
474
# optional, default is 0
475
#
476
# phrase_boundary_step = 100
477
478
479
# blended characters list
480
# blended chars are indexed both as separators and valid characters
481
# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
482
# optional, default is empty
483
#
484
# blend_chars = +, &, U+23
485
486
487
# blended token indexing mode
488
# a comma separated list of blended token indexing variants
489
# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
490
# optional, default is trim_none
491
#
492
# blend_mode = trim_tail, skip_pure
493
494
495
# whether to strip HTML tags from incoming documents
496
# known values are 0 (do not strip) and 1 (do strip)
497
# optional, default is 0
498
html_strip = 0
499
500
# what HTML attributes to index if stripping HTML
501
# optional, default is empty (do not index anything)
502
#
503
# html_index_attrs = img=alt,title; a=title;
504
505
506
# what HTML elements contents to strip
507
# optional, default is empty (do not strip element contents)
508
#
509
# html_remove_elements = style, script
510
511
512
# whether to preopen index data files on startup
513
# optional, default is 0 (do not preopen), searchd-only
514
#
515
# preopen = 1
516
517
518
# whether to enable in-place inversion (2x less disk, 90-95% speed)
519
# optional, default is 0 (use separate temporary files), indexer-only
520
#
521
# inplace_enable = 1
522
523
524
# in-place fine-tuning options
525
# optional, defaults are listed below
526
#
527
# inplace_hit_gap = 0 # preallocated hitlist gap size
528
# inplace_docinfo_gap = 0 # preallocated docinfo gap size
529
# inplace_reloc_factor = 0.1 # relocation buffer size within arena
530
# inplace_write_factor = 0.1 # write buffer size within arena
531
532
533
# whether to index original keywords along with stemmed versions
534
# enables "=exactform" operator to work
535
# optional, default is 0
536
#
537
# index_exact_words = 1
538
539
540
# position increment on overshort (less than min_word_len) words
541
# optional, allowed values are 0 and 1, default is 1
542
#
543
# overshort_step = 1
544
545
546
# position increment on stopword
547
# optional, allowed values are 0 and 1, default is 1
548
#
549
# stopword_step = 1
550
551
552
# hitless words list
553
# positions for these keywords will not be stored in the index
554
# optional, allowed values are 'all', or a list file name
555
#
556
# hitless_words = all
557
# hitless_words = hitless.txt
558
559
560
# detect and index sentence and paragraph boundaries
561
# required for the SENTENCE and PARAGRAPH operators to work
562
# optional, allowed values are 0 and 1, default is 0
563
#
564
# index_sp = 1
565
566
567
# index zones, delimited by HTML/XML tags
568
# a comma separated list of tags and wildcards
569
# required for the ZONE operator to work
570
# optional, default is empty string (do not index zones)
571
#
572
# index_zones = title, h*, th
573
574
575
# index per-document and average per-index field lengths, in tokens
576
# required for the BM25A(), BM25F() in expression ranker
577
# optional, default is 0 (do not index field lengths)
578
#
579
# index_field_lengths = 1
580
581
582
# regular expressions (regexps) to filter the fields and queries with
583
# gets applied to data source fields when indexing
584
# gets applied to search queries when searching
585
# multi-value, optional, default is empty list of regexps
586
#
587
# regexp_filter = \b(\d+)\" => \1inch
588
# regexp_filter = (blue|red) => color
589
590
591
# list of the words considered frequent with respect to bigram indexing
592
# optional, default is empty
593
#
594
# bigram_freq_words = the, a, i, you, my
595
596
597
# bigram indexing mode
598
# known values are none, all, first_freq, both_freq
599
# optional, default is none (do not index bigrams)
600
#
601
# bigram_index = both_freq
602
603
604
# snippet document file name prefix
605
# prepended to file names when generating snippets using load_files option
606
# WARNING, this is a prefix (not a path), trailing slash matters!
607
# optional, default is empty
608
#
609
# snippets_file_prefix = /mnt/mydocs/server1
610
611
612
# whether to apply stopwords before or after stemming
613
# optional, default is 0 (apply stopwords after stemming)
614
#
615
# stopwords_unstemmed = 0
616
617
618
# path to a global (cluster-wide) keyword IDFs file
619
# optional, default is empty (use local IDFs)
620
#
621
# global_idf = /usr/local/sphinx/var/global.idf
622
}
623
624
625
# inherited index example
626
#
627
# all the parameters are copied from the parent index,
628
# and may then be overridden in this index definition
629
index zvukirustemmed : zvukiru
630
{
631
path = /var/db/sphinxsearch/data/zvukirustemmed
632
morphology = stem_en
633
}
634
635
636
# distributed index example
637
#
638
# this is a virtual index which can NOT be directly indexed,
639
# and only contains references to other local and/or remote indexes
640
index dist1
641
{
642
# 'distributed' index type MUST be specified
643
type = distributed
644
645
# local index to be searched
646
# there can be many local indexes configured
647
local = test1
648
local = test1stemmed
649
650
# remote agent
651
# multiple remote agents may be specified
652
# syntax for TCP connections is 'hostname:port:index1,[index2[,...]]'
653
# syntax for local UNIX connections is '/path/to/socket:index1,[index2[,...]]'
654
agent = localhost:9313:remote1
655
agent = localhost:9314:remote2,remote3
656
# agent = /var/run/searchd.sock:remote4
657
658
# remote agent mirrors groups, aka mirrors, aka HA agents
659
# defines 2 or more interchangeable mirrors for a given index part
660
#
661
# agent = server3:9312 | server4:9312 :indexchunk2
662
# agent = server3:9312:chunk2server3 | server4:9312:chunk2server4
663
# agent = server3:chunk2server3 | server4:chunk2server4
664
# agent = server21|server22|server23:chunk2
665
666
667
# blackhole remote agent, for debugging/testing
668
# network errors and search results will be ignored
669
#
670
# agent_blackhole = testbox:9312:testindex1,testindex2
671
672
673
# persistently connected remote agent
674
# reduces connect() pressure, requires that workers IS threads
675
#
676
# agent_persistent = testbox:9312:testindex1,testindex2
677
678
679
# remote agent connection timeout, milliseconds
680
# optional, default is 1000 ms, ie. 1 sec
681
agent_connect_timeout = 1000
682
683
# remote agent query timeout, milliseconds
684
# optional, default is 3000 ms, ie. 3 sec
685
agent_query_timeout = 3000
686
687
# HA mirror agent strategy
688
# optional, defaults to ??? (random mirror)
689
# known values are nodeads, noerrors, roundrobin, nodeadstm, noerrorstm
690
#
691
# ha_strategy = nodeads
692
693
# path to RLP context file
694
# optional, default is empty
695
#
696
# rlp_context = /usr/local/share/sphinx/rlp/rlp-context.xml
697
}
698
699
700
# realtime index example
701
#
702
# you can run INSERT, REPLACE, and DELETE on this index on the fly
703
# using MySQL protocol (see 'listen' directive below)
704
index rt
705
{
706
# 'rt' index type must be specified to use RT index
707
type = rt
708
709
# index files path and file name, without extension
710
# mandatory, path must be writable, extensions will be auto-appended
711
path = /var/db/sphinxsearch/data/rt
712
713
# RAM chunk size limit
714
# RT index will keep at most this much data in RAM, then flush to disk
715
# optional, default is 128M
716
#
717
# rt_mem_limit = 512M
718
719
# full-text field declaration
720
# multi-value, mandatory
721
rt_field = title
722
rt_field = content
723
724
# unsigned integer attribute declaration
725
# multi-value (an arbitrary number of attributes is allowed), optional
726
# declares an unsigned 32-bit attribute
727
rt_attr_uint = gid
728
729
# RT indexes currently support the following attribute types:
730
# uint, bigint, float, timestamp, string, mva, mva64, json
731
#
732
# rt_attr_bigint = guid
733
# rt_attr_float = gpa
734
# rt_attr_timestamp = ts_added
735
# rt_attr_string = author
736
# rt_attr_multi = tags
737
# rt_attr_multi_64 = tags64
738
# rt_attr_json = extra_data
739
}
740
741
#############################################################################
742
## indexer settings
743
#############################################################################
744
745
indexer
746
{
747
# memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
748
# optional, default is 128M, max is 2047M, recommended is 256M to 1024M
749
mem_limit = 128M
750
751
# maximum IO calls per second (for I/O throttling)
752
# optional, default is 0 (unlimited)
753
#
754
# max_iops = 40
755
756
757
# maximum IO call size, bytes (for I/O throttling)
758
# optional, default is 0 (unlimited)
759
#
760
# max_iosize = 1048576
761
762
763
# maximum xmlpipe2 field length, bytes
764
# optional, default is 2M
765
#
766
# max_xmlpipe2_field = 4M
767
768
769
# write buffer size, bytes
770
# several (currently up to 4) buffers will be allocated
771
# write buffers are allocated in addition to mem_limit
772
# optional, default is 1M
773
#
774
# write_buffer = 1M
775
776
777
# maximum file field adaptive buffer size
778
# optional, default is 8M, minimum is 1M
779
#
780
# max_file_field_buffer = 32M
781
782
783
# how to handle IO errors in file fields
784
# known values are 'ignore_field', 'skip_document', and 'fail_index'
785
# optional, default is 'ignore_field'
786
#
787
# on_file_field_error = skip_document
788
789
790
# lemmatizer cache size
791
# improves the indexing time when the lemmatization is enabled
792
# optional, default is 256K
793
#
794
# lemmatizer_cache = 512M
795
}
796
797
#############################################################################
798
## searchd settings
799
#############################################################################
800
801
searchd
802
{
803
# [hostname:]port[:protocol], or /unix/socket/path to listen on
804
# known protocols are 'sphinx' (SphinxAPI) and 'mysql41' (SphinxQL)
805
#
806
# multi-value, multiple listen points are allowed
807
# optional, defaults are 9312:sphinx and 9306:mysql41, as below
808
#
809
# listen = 127.0.0.1
810
# listen = 192.168.0.1:9312
811
# listen = 9312
812
# listen = /var/run/searchd.sock
813
listen = 9312
814
listen = 9306:mysql41
815
816
# log file, searchd run info is logged here
817
# optional, default is 'searchd.log'
818
log = /var/log/sphinxsearch/searchd.log
819
820
# query log file, all search queries are logged here
821
# optional, default is empty (do not log queries)
822
query_log = /var/log/sphinxsearch/sphinx-query.log
823
824
# client read timeout, seconds
825
# optional, default is 5
826
read_timeout = 5
827
828
# request timeout, seconds
829
# optional, default is 5 minutes
830
client_timeout = 300
831
832
# maximum amount of children to fork (concurrent searches to run)
833
# optional, default is 0 (unlimited)
834
max_children = 30
835
836
# maximum amount of persistent connections from this master to each agent host
837
# optional, but necessary if you use agent_persistent. It is reasonable to set the value
838
# as max_children, or less on the agent's hosts.
839
persistent_connections_limit = 30
840
841
# PID file, searchd process ID file name
842
# mandatory
843
pid_file = /var/run/sphinxsearch/searchd.pid
844
845
# seamless rotate, prevents rotate stalls if precaching huge datasets
846
# optional, default is 1
847
seamless_rotate = 1
848
849
# whether to forcibly preopen all indexes on startup
850
# optional, default is 1 (preopen everything)
851
preopen_indexes = 1
852
853
# whether to unlink .old index copies on successful rotation.
854
# optional, default is 1 (do unlink)
855
unlink_old = 1
856
857
# attribute updates periodic flush timeout, seconds
858
# updates will be automatically dumped to disk this frequently
859
# optional, default is 0 (disable periodic flush)
860
#
861
# attr_flush_period = 900
862
863
864
# MVA updates pool size
865
# shared between all instances of searchd, disables attr flushes!
866
# optional, default size is 1M
867
mva_updates_pool = 1M
868
869
# max allowed network packet size
870
# limits both query packets from clients, and responses from agents
871
# optional, default size is 8M
872
max_packet_size = 8M
873
874
# max allowed per-query filter count
875
# optional, default is 256
876
max_filters = 256
877
878
# max allowed per-filter values count
879
# optional, default is 4096
880
max_filter_values = 4096
881
882
883
# socket listen queue length
884
# optional, default is 5
885
#
886
# listen_backlog = 5
887
888
889
# per-keyword read buffer size
890
# optional, default is 256K
891
#
892
# read_buffer = 256K
893
894
895
# unhinted read size (currently used when reading hits)
896
# optional, default is 32K
897
#
898
# read_unhinted = 32K
899
900
901
# max allowed per-batch query count (aka multi-query count)
902
# optional, default is 32
903
max_batch_queries = 32
904
905
906
# max common subtree document cache size, per-query
907
# optional, default is 0 (disable subtree optimization)
908
#
909
# subtree_docs_cache = 4M
910
911
912
# max common subtree hit cache size, per-query
913
# optional, default is 0 (disable subtree optimization)
914
#
915
# subtree_hits_cache = 8M
916
917
918
# multi-processing mode (MPM)
919
# known values are none, fork, prefork, and threads
920
# threads is required for RT backend to work
921
# optional, default is threads
922
workers = threads # for RT to work
923
924
925
# max threads to create for searching local parts of a distributed index
926
# optional, default is 0, which means disable multi-threaded searching
927
# should work with all MPMs (ie. does NOT require workers=threads)
928
#
929
# dist_threads = 4
930
931
932
# binlog files path; use empty string to disable binlog
933
# optional, default is build-time configured data directory
934
#
935
# binlog_path = # disable logging
936
# binlog_path = /var/db/sphinxsearch/data # binlog.001 etc will be created there
937
938
939
# binlog flush/sync mode
940
# 0 means flush and sync every second
941
# 1 means flush and sync every transaction
942
# 2 means flush every transaction, sync every second
943
# optional, default is 2
944
#
945
# binlog_flush = 2
946
947
948
# binlog per-file size limit
949
# optional, default is 128M, 0 means no limit
950
#
951
# binlog_max_log_size = 256M
952
953
954
# per-thread stack size, only affects workers=threads mode
955
# optional, default is 64K
956
#
957
# thread_stack = 128K
958
959
960
# per-keyword expansion limit (for dict=keywords prefix searches)
961
# optional, default is 0 (no limit)
962
#
963
# expansion_limit = 1000
964
965
966
# RT RAM chunks flush period
967
# optional, default is 0 (no periodic flush)
968
#
969
# rt_flush_period = 900
970
971
972
# query log file format
973
# optional, known values are plain and sphinxql, default is plain
974
#
975
# query_log_format = sphinxql
976
977
978
# version string returned to MySQL network protocol clients
979
# optional, default is empty (use Sphinx version)
980
#
981
# mysql_version_string = 5.0.37
982
983
984
# default server-wide collation
985
# optional, default is libc_ci
986
#
987
# collation_server = utf8_general_ci
988
989
990
# server-wide locale for libc based collations
991
# optional, default is C
992
#
993
# collation_libc_locale = ru_RU.UTF-8
994
995
996
# threaded server watchdog (only used in workers=threads mode)
997
# optional, values are 0 and 1, default is 1 (watchdog on)
998
#
999
# watchdog = 1
1000
1001
1002
# costs for max_predicted_time model, in (imaginary) nanoseconds
1003
# optional, default is "doc=64, hit=48, skip=2048, match=64"
1004
#
1005
# predicted_time_costs = doc=64, hit=48, skip=2048, match=64
1006
1007
1008
# current SphinxQL state (uservars etc) serialization path
1009
# optional, default is none (do not serialize SphinxQL state)
1010
#
1011
# sphinxql_state = sphinxvars.sql
1012
1013
1014
# maximum RT merge thread IO calls per second, and per-call IO size
1015
# useful for throttling (the background) OPTIMIZE INDEX impact
1016
# optional, default is 0 (unlimited)
1017
#
1018
# rt_merge_iops = 40
1019
# rt_merge_maxiosize = 1M
1020
1021
1022
# interval between agent mirror pings, in milliseconds
1023
# 0 means disable pings
1024
# optional, default is 1000
1025
#
1026
# ha_ping_interval = 0
1027
1028
1029
# agent mirror statistics window size, in seconds
1030
# stats older than the window size (karma) are retired
1031
# that is, they will not affect master choice of agents in any way
1032
# optional, default is 60 seconds
1033
#
1034
# ha_period_karma = 60
1035
1036
1037
# delay between preforked children restarts on rotation, in milliseconds
1038
# optional, default is 0 (no delay)
1039
#
1040
# prefork_rotation_throttle = 100
1041
1042
1043
# a prefix to prepend to the local file names when creating snippets
1044
# with load_files and/or load_files_scatter options
1045
# optional, default is empty
1046
#
1047
# snippets_file_prefix = /mnt/common/server1/
1048
}
1049
1050
#############################################################################
1051
## common settings
1052
#############################################################################
1053
1054
common
1055
{
1056
1057
# lemmatizer dictionaries base path
1058
# optional, default is /usr/local/share (see ./configure --datadir)
1059
#
1060
# lemmatizer_base = /usr/local/share/sphinx/dicts
1061
1062
1063
# how to handle syntax errors in JSON attributes
1064
# known values are 'ignore_attr' and 'fail_index'
1065
# optional, default is 'ignore_attr'
1066
#
1067
# on_json_attr_error = fail_index
1068
1069
1070
# whether to auto-convert numeric values from strings in JSON attributes
1071
# with auto-conversion, string value with actually numeric data
1072
# (as in {"key":"12345"}) gets stored as a number, rather than string
1073
# optional, allowed values are 0 and 1, default is 0 (do not convert)
1074
#
1075
# json_autoconv_numbers = 1
1076
1077
1078
# whether and how to auto-convert key names in JSON attributes
1079
# known value is 'lowercase'
1080
# optional, default is unspecified (do nothing)
1081
#
1082
# json_autoconv_keynames = lowercase
1083
1084
1085
# path to RLP root directory
1086
# optional, default is /usr/local/share (see ./configure --datadir)
1087
#
1088
# rlp_root = /usr/local/share/sphinx/rlp
1089
1090
1091
# path to RLP environment file
1092
# optional, default is /usr/local/share/rlp-environment.xml (see ./configure --datadir)
1093
#
1094
# rlp_environment = /usr/local/share/sphinx/rlp/rlp/etc/rlp-environment.xml
1095
1096
1097
# maximum total size of documents batched before processing them by the RLP
1098
# optional, default is 51200
1099
#
1100
# rlp_max_batch_size = 100k
1101
1102
1103
# maximum number of documents batched before processing them by the RLP
1104
# optional, default is 50
1105
#
1106
# rlp_max_batch_docs = 100
1107
1108
1109
# trusted plugin directory
1110
# optional, default is empty (disable UDFs)
1111
#
1112
# plugin_dir = /usr/local/sphinx/lib
1113
1114
}
1115
1116
# --eof--
utf8/plugins/sphinx/lib/sphinx/Keeper.pm
4
4
use warnings 'all';
5
5
use base qw(Contenido::Keeper);
6
6
use Contenido::Globals;
7
use Data::Dumper;
7
8
8
9
######################
9
10
# Отправить объект в поиск:
…
…
26
27
my $doc = shift;
27
28
return undef unless ref $doc && $doc->id;
28
29
29
my ($object) = $self->get_documents(
30
my ($object) = $keeper->get_documents(
30
31
class => 'sphinx::Search',
31
32
object_id => $doc->id,
32
33
object_class => $doc->class,
…
…
35
36
my $data = $doc->get_search_data;
36
37
return undef unless $data;
37
38
unless ( ref $object ) {
38
$object = sphinx::Search->new( $self );
39
$object = sphinx::Search->new( $keeper );
39
40
$object->status( 1 );
40
41
$object->is_deleted( 0 );
41
42
$object->object_id( $doc->id );
…
…
62
63
}
63
64
64
65
66
# Методы поиска
67
####################################################################
68
# Run a full-text query against a Sphinx index over the SQL connection
# returned by $self->SQL.
#
# Arguments:
#   $text  - query string bound to MATCH(?); a false value returns nothing.
#   %opts  - named options:
#       db_table      index name to query (default: $self->state->table_name);
#                     interpolated into the statement as-is
#       count         true => return the number of matches instead of rows
#       limit         maximum number of rows, digits only (default 1000)
#       no_limit      true => do not apply the default limit
#       offset        number of rows to skip, digits only
#       return_value,
#       hash_by       accepted and discarded; this method does not use them
#     Every remaining key is an attribute filter: a scalar value becomes
#     "key = ?", an array reference becomes "key in (?,...)".
#
# Returns:
#   count mode  - the first column of the first row (0 for an empty IN list);
#   otherwise   - an array reference of row hash references;
#   nothing     - when $text is false, limit/offset are malformed or a filter
#                 name is not a plain identifier.
sub search {
    my $self = shift;
    my $text = shift;
    return unless $text;
    my (%opts) = @_;

    my $db_table = delete $opts{db_table} || $self->state->table_name;
    my $count    = delete $opts{count};

    # A single non-digit (including a minus sign) disqualifies the value.
    my $limit = delete $opts{limit};
    return if $limit && $limit =~ /\D/;
    my $no_limit = delete $opts{no_limit};
    $limit ||= 1000 unless $no_limit;
    my $offset = delete $opts{offset};
    return if $offset && $offset =~ /\D/;

    # These must not leak into the attribute filters below.
    delete @opts{qw(return_value hash_by)};

    my @wheres = ("MATCH(?)");
    my @values = ($text);

    # Sorted keys keep the statement text stable between calls, so that
    # prepare_cached can actually reuse the handle.
    for my $key ( sort keys %opts ) {
        # Filter names are spliced into the statement text, so only plain
        # identifiers are accepted.
        unless ( $key =~ /\A[A-Za-z_][A-Za-z0-9_]*\z/ ) {
            warn "SEARCH: invalid filter name '$key'\n";
            return;
        }
        my $val = $opts{$key};
        if ( ref $val eq 'ARRAY' ) {
            # "key in ()" is a syntax error; an empty set matches nothing.
            return $count ? 0 : [] unless @$val;
            push @wheres, "$key in (".join(',', map { '?' } @$val).")";
            push @values, @$val;
        } else {
            push @wheres, "$key = ?";
            push @values, $val;
        }
    }

    my $query = "select ".($count ? 'count(*) as cnt' : '*, weight() as weight')." from $db_table where ".join( ' and ', @wheres );
    $query .= " limit $limit " if $limit;
    # NOTE(review): with no_limit set this emits OFFSET without LIMIT;
    # confirm that the server accepts that form.
    $query .= " offset $offset " if $offset;

    warn "SEARCH QUERY: $query\n" if $DEBUG;
    warn "SEARCH VALUES: ".Dumper( \@values ) if $DEBUG;

    my $sth = $self->SQL->prepare_cached( $query );
    $sth->execute( @values );

    my $result;
    if ( $count ) {
        my $row = $sth->fetchrow_arrayref;
        $result = $row ? $row->[0] : undef;
        # Only one row is read, so release the cached handle explicitly;
        # otherwise it is still active on the next prepare_cached call.
        $sth->finish;
    } else {
        $result = [];
        while ( my $row = $sth->fetchrow_hashref ) {
            push @$result, $row;
        }
    }
    return $result;
}
121
122
# Same as search(), but always queries the index named by
# $self->state->table_name_stemmed. Takes the same arguments as search();
# the db_table option is appended last, so it wins over a caller-supplied one.
sub stemmed {
    my ($self, @search_args) = @_;

    my $stemmed_index = $self->state->table_name_stemmed;
    push @search_args, db_table => $stemmed_index;

    return $self->search( @search_args );
}
127
128
# МЕТОДЫ ДОСТУПА К СОЕДИНЕНИЯМ С БАЗОЙ УМНЫЕ
129
####################################################################
130
# получение соединения с базой или установка нового если его не было
131
# Accessor for the database handle stored in $self->{SQL}.
# Asks connect_check() first and hands the handle back only when that
# succeeds; otherwise returns undef.
sub SQL {
    my ($self) = @_;

    unless ( $self->connect_check() ) {
        return undef;
    }
    return $self->{SQL};
}
135
136
# -------------------------------------------------------------------------------------------------
137
# Открываем соединение с базой данных
138
# -------------------------------------------------------------------------------------------------
139
# Open the database connection unless one is already alive.
# A fresh handle from db_connect() is stored in $self->{SQL}; when the state
# object reports a client encoding, a SET NAMES statement is issued on it.
# Dies if no handle could be obtained. On success marks the connection as
# good in $self->{_connect_ok} and returns 1.
sub connect {
    my ($self) = @_;

    unless ( $self->is_connected ) {
        $self->{SQL} = $self->db_connect;
        unless ( $self->{SQL} ) {
            warn "Не могу соединиться с базой данных";
            die;
        }
        if ( $self->state->db_client_encoding ) {
            $self->{SQL}->do( "SET NAMES '".$self->state->db_client_encoding."'" );
        }
    }

    $self->{_connect_ok} = 1;
    return 1;
}
154
155
#проверка соединения с базой кеширующая состояние соединения
156
# Connection check that caches its verdict in $self->{_connect_ok}.
# Returns 1 straight away when the cached flag is set; otherwise asks
# is_connected() and, failing that, tries connect().
# Returns 1 on success and 0 (after a warning) when connect() reports failure.
sub connect_check {
    my ($self) = @_;

    return 1 if $self->{_connect_ok};

    if ( $self->is_connected ) {
        $self->{_connect_ok} = 1;
        return 1;
    }

    return 1 if $self->connect;

    # Not expected to be reached: connect() is supposed to die on failure.
    warn "Connect failed\n";
    return 0;
}
172
173
# Open a new DBI connection through the mysql driver, using the host and
# port stored in $self->{db_host} and $self->{db_port}, with
# mysql_enable_utf8 switched on in the DSN.
# Returns the database handle; dies with the driver's error text appended
# when the connection cannot be established.
sub db_connect {
    my $self = shift;

    my $dsn = 'DBI:mysql:host='.$self->{db_host}.';port='.$self->{db_port}.';mysql_enable_utf8=1';
    my $dbh = DBI->connect( $dsn );
    unless ( $dbh ) {
        # Include the driver's reason so that the failure can be diagnosed.
        my $reason = defined $DBI::errstr ? $DBI::errstr : 'unknown error';
        die "Contenido Error: Не могу соединиться с Sphinx базой данных: $reason\n";
    }

#    $dbh->{'AutoCommit'} = 1;
#    $dbh->{mysql_auto_reconnect} = 1;

    return $dbh;
}
183
184
# Probe the handle in $self->{SQL}: the connection counts as alive only when
# the slot holds a reference whose class provides ping() and that ping()
# succeeds. The verdict is stored in $self->{_connect_ok} and returned
# as 1 or 0.
sub is_connected {
    my ($self) = @_;

    my $dbh   = $self->{SQL};
    my $alive = ( ref($dbh) && $dbh->can('ping') && $dbh->ping() ) ? 1 : 0;

    $self->{_connect_ok} = $alive;
    return $alive;
}
65
197
1;
utf8/plugins/sphinx/lib/sphinx/State.pm.proto
12
12
bless $self, $class;
13
13
14
14
# configured
15
$self->{debug} = (lc('') eq 'yes');
16
$self->{project} = '';
17
$self->{contenido_notab} = 0;
15
$self->{debug} = (lc('@DEBUG@') eq 'yes');
16
$self->{project} = '@PROJECT@';
17
$self->{contenido_notab} = 1;
18
18
$self->{tab_name} = 'sphinx';
19
19
20
20
# зашитая конфигурация плагина
21
$self->{db_type} = 'none'; ### For REAL database use 'remote'
22
$self->{db_keepalive} = 0;
23
$self->{db_host} = '';
21
$self->{db_type} = 'remote'; ### For REAL database use 'remote'
22
$self->{db_keepalive} = 0;
23
$self->{db_host} = '@SPHINX_HOST@';
24
24
$self->{db_name} = '';
25
25
$self->{db_user} = '';
26
$self->{db_password} = '';
27
$self->{db_port} = '';
26
$self->{db_password} = '';
27
$self->{db_port} = '@SPHINX_PORT@';
28
28
$self->{store_method} = 'toast';
29
29
$self->{cascade} = 1;
30
30
$self->{db_prepare} = 0;
31
31
32
$self->{memcached_enable} = lc( '' ) eq 'yes' ? 1 : 0;
32
$self->{memcached_enable} = lc( '@MEMCACHED_ENABLE@' ) eq 'yes' ? 1 : 0;
33
33
$self->{memcached_enable_compress} = 1;
34
$self->{memcached_backend} = '';
35
$self->{memcached_servers} = [qw()];
34
$self->{memcached_backend} = '@MEMCACHED_BACKEND@';
35
$self->{memcached_servers} = [qw(@MEMCACHED_SERVERS@)];
36
36
$self->{memcached_busy_lock} = 60;
37
$self->{memcached_delayed} = lc('') eq 'yes' ? 1 : 0;
37
$self->{memcached_delayed} = lc('@MEMCACHED_DELAYED@') eq 'yes' ? 1 : 0;
38
38
39
39
$self->{serialize_with} = 'json'; ### or 'dumper'
40
40
…
…
44
44
$self->{images_directory} = '/nonexistent';
45
45
$self->{preview} = '0';
46
46
47
$self->{table_name} = '@SPHINX_TABLE@';
48
$self->{table_name_stemmed} = '@SPHINX_TABLE_STEMMED@';
49
47
50
$self->_init_();
48
51
$self;
49
52
}
…
…
90
93
data_directory
91
94
images_directory
92
95
preview
96
97
table_name
98
table_name_stemmed
93
99
);
94
100
}
95
101
Небольшая справка по веткам
cnddist – контейнер, в котором хранятся все дистрибутивы всех библиотек и программных пакетов, которые использовались при построении различных версий Contenido. Если какой-то библиотеки в данном хранилище нет, инсталлятор сделает попытку "подтянуть" ее с веба (например, с CPAN). Если библиотека слишком старая, есть очень большая вероятность, что ее там уже нет. Поэтому мы храним весь хлам от всех сборок. Если какой-то дистрибутив вдруг отсутствует в cnddist - напишите нам, мы положим его туда.
koi8 – отмирающая ветка, чей код, выдача и все внутренние библиотеки заточены на кодировку KOI8-R. Вносятся только те дополнения, которые касаются внешнего вида и функционала админки, баги ядра, обязательные обновления портов и мелочи, которые легко скопипастить. В дальнейшем планируется полная остановка поддержки по данной ветке.
utf8 – актуальная ветка, заточенная под UTF-8.
Внутри каждой ветки: core – исходники ядра; install – скрипт установки инсталляции; plugins – плагины; samples – "готовые к употреблению" проекты, которые можно поставить, запустить и посмотреть, как они работают.