Last active
August 29, 2015 14:00
-
-
Save fumi/11276247 to your computer and use it in GitHub Desktop.
ckan-2.2用暫定schema.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<schema name="ckan" version="2.0"> | |
<types> | |
<!-- Basic scalar types. "string" and "boolean" omit norms and sort documents
     that lack a value after those that have one (sortMissingLast). -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!-- Binary field type; declared with no extra options. -->
<fieldType name="binary" class="solr.BinaryField"/>
<!-- Trie-encoded numeric types with precisionStep="0". -->
<fieldType name="int"
           class="solr.TrieIntField"
           precisionStep="0"
           positionIncrementGap="0"
           omitNorms="true"/>
<fieldType name="float"
           class="solr.TrieFloatField"
           precisionStep="0"
           positionIncrementGap="0"
           omitNorms="true"/>
<fieldType name="long"
           class="solr.TrieLongField"
           precisionStep="0"
           positionIncrementGap="0"
           omitNorms="true"/>
<fieldType name="double"
           class="solr.TrieDoubleField"
           precisionStep="0"
           positionIncrementGap="0"
           omitNorms="true"/>

<!-- "t"-prefixed variants of the numeric types above, declared with precisionStep="8". -->
<fieldType name="tint"
           class="solr.TrieIntField"
           precisionStep="8"
           positionIncrementGap="0"
           omitNorms="true"/>
<fieldType name="tfloat"
           class="solr.TrieFloatField"
           precisionStep="8"
           positionIncrementGap="0"
           omitNorms="true"/>
<fieldType name="tlong"
           class="solr.TrieLongField"
           precisionStep="8"
           positionIncrementGap="0"
           omitNorms="true"/>
<fieldType name="tdouble"
           class="solr.TrieDoubleField"
           precisionStep="8"
           positionIncrementGap="0"
           omitNorms="true"/>

<!-- Date types: "date" uses precisionStep="0", "tdate" uses precisionStep="6". -->
<fieldType name="date"
           class="solr.TrieDateField"
           precisionStep="0"
           positionIncrementGap="0"
           omitNorms="true"/>
<fieldType name="tdate"
           class="solr.TrieDateField"
           precisionStep="6"
           positionIncrementGap="0"
           omitNorms="true"/>
<!-- English-stemmed text type: whitespace tokenization, word-delimiter splitting,
     lower-casing, Snowball (English) stemming and ASCII folding. The query-side
     analyzer additionally expands synonyms from synonyms.txt and does not
     catenate word/number parts. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Terms listed in protwords.txt are protected from stemming. -->
    <filter class="solr.SnowballPorterFilterFactory"
            language="English"
            protected="protwords.txt"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory"
            language="English"
            protected="protwords.txt"/>
    <filter class="solr.ASCIIFoldingFilterFactory"/>
  </analyzer>
</fieldType>
<!-- General-purpose text type without stemming; a reasonable choice when the
     language of the field content is not known. Splitting on case changes is
     turned off for both analyzers. -->
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <!-- Synonym expansion happens at query time only. -->
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
<fieldType name="text_ja"
           class="solr.TextField"
           positionIncrementGap="100"
           autoGeneratePhraseQueries="false">
  <!--
    Japanese text, tokenized by the Kuromoji morphological analyzer (JapaneseTokenizer).

    Tokenizer "mode" values:
      normal   : regular segmentation
      search   : segmentation geared towards search; a heuristic splits compounds into
                 their parts and keeps the compound itself as a synonym (default)
      extended : same as search, but unknown words are emitted as unigrams (experimental)

    Both analyzers below use mode="search". For some applications it may be preferable to
    index in search mode and query in normal mode, which reduces recall and prevents parts
    of compounds from being matched and highlighted; to do that, set mode="normal" on the
    tokenizer inside <analyzer type="query">.

    Kuromoji also supports a user dictionary that overrides the statistical model with
    custom segmentation, part-of-speech tags and readings, without having to specify
    weights. Note that user dictionaries have not been subject to extensive testing.
    User dictionary attributes:
      userDictionary         : user dictionary filename
      userDictionaryEncoding : user dictionary encoding (default is UTF-8)

    See lang/userdict_ja.txt for a sample user dictionary file.
    See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
  -->
  <analyzer type="index">
    <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
    <!-- Alternative with a user dictionary:
         <tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/> -->

    <!-- Inflected verbs and adjectives are reduced to their base/dictionary form (辞書形). -->
    <filter class="solr.JapaneseBaseFormFilterFactory"/>

    <!-- Tokens carrying the part-of-speech tags listed in lang/stoptags_ja.txt are dropped. -->
    <filter class="solr.JapanesePartOfSpeechStopFilterFactory"
            tags="lang/stoptags_ja.txt"
            enablePositionIncrements="true"/>

    <!-- Full-width romaji becomes half-width and half-width kana becomes full-width
         (a subset of Unicode NFKC). -->
    <filter class="solr.CJKWidthFilterFactory"/>

    <!-- Common tokens that are typically not useful for search, but hurt ranking, are removed. -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_ja.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>

    <!-- Common katakana spelling variations are normalized by removing a trailing
         long sound character (U+30FC). -->
    <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>

    <!-- Romaji characters are lower-cased. -->
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <!-- Same tokenizer and filter chain as the index analyzer above. -->
    <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
    <!-- Alternative with a user dictionary:
         <tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/> -->

    <!-- Inflected verbs and adjectives are reduced to their base/dictionary form (辞書形). -->
    <filter class="solr.JapaneseBaseFormFilterFactory"/>

    <!-- Tokens carrying the part-of-speech tags listed in lang/stoptags_ja.txt are dropped. -->
    <filter class="solr.JapanesePartOfSpeechStopFilterFactory"
            tags="lang/stoptags_ja.txt"
            enablePositionIncrements="true"/>

    <!-- Full-width romaji becomes half-width and half-width kana becomes full-width
         (a subset of Unicode NFKC). -->
    <filter class="solr.CJKWidthFilterFactory"/>

    <!-- Common tokens that are typically not useful for search, but hurt ranking, are removed. -->
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_ja.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>

    <!-- Common katakana spelling variations are normalized by removing a trailing
         long sound character (U+30FC). -->
    <filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>

    <!-- Romaji characters are lower-cased. -->
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
</types> | |
<fields> | |
<!-- Required identifier fields. -->
<field name="index_id"         type="string"  indexed="true" stored="true" required="true"/>
<field name="id"               type="string"  indexed="true" stored="true" required="true"/>
<field name="site_id"          type="string"  indexed="true" stored="true" required="true"/>

<!-- Title is analyzed as Japanese text. -->
<field name="title"            type="text_ja" indexed="true" stored="true"/>

<!-- Exact-match string attributes. -->
<field name="entity_type"      type="string"  indexed="true" stored="true" omitNorms="true"/>
<field name="dataset_type"     type="string"  indexed="true" stored="true"/>
<field name="state"            type="string"  indexed="true" stored="true" omitNorms="true"/>
<field name="name"             type="string"  indexed="true" stored="true" omitNorms="true"/>
<field name="revision_id"      type="string"  indexed="true" stored="true" omitNorms="true"/>
<field name="version"          type="string"  indexed="true" stored="true"/>
<field name="url"              type="string"  indexed="true" stored="true" omitNorms="true"/>
<field name="ckan_url"         type="string"  indexed="true" stored="true" omitNorms="true"/>
<field name="download_url"     type="string"  indexed="true" stored="true" omitNorms="true"/>

<!-- Free text and contact details: names and notes use the Japanese analyzer,
     e-mail addresses use the unstemmed "textgen" type. -->
<field name="notes"            type="text_ja" indexed="true" stored="true"/>
<field name="author"           type="text_ja" indexed="true" stored="true"/>
<field name="author_email"     type="textgen" indexed="true" stored="true"/>
<field name="maintainer"       type="text_ja" indexed="true" stored="true"/>
<field name="maintainer_email" type="textgen" indexed="true" stored="true"/>

<!-- Licensing. -->
<field name="license"          type="string"  indexed="true" stored="true"/>
<field name="license_id"       type="string"  indexed="true" stored="true"/>

<!-- Ratings are indexed but not stored. -->
<field name="ratings_count"    type="int"     indexed="true" stored="false"/>
<field name="ratings_average"  type="float"   indexed="true" stored="false"/>

<!-- Classification; "tags" and "groups" accept multiple values, "capacity" a single one. -->
<field name="tags"             type="string"  indexed="true" stored="true" multiValued="true"/>
<field name="groups"           type="string"  indexed="true" stored="true" multiValued="true"/>
<field name="capacity"         type="string"  indexed="true" stored="true" multiValued="false"/>

<!-- Multi-valued res_* fields. -->
<field name="res_description"  type="text_ja" indexed="true" stored="true" multiValued="true"/>
<field name="res_format"       type="string"  indexed="true" stored="true" multiValued="true"/>
<field name="res_url"          type="string"  indexed="true" stored="true" multiValued="true"/>
<!-- Catch-all field containing all other searchable text fields (implemented
     via the copyField rules further on in this schema). -->
<field name="text" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<!-- Aggregated URL field; it is the destination of the URL copyField rules in this schema. -->
<field name="urls" type="text_ja" indexed="true" stored="false" multiValued="true"/>

<!-- Relationship fields: all multi-valued, indexed but not stored, and analyzed
     with the same Japanese text type so that they behave uniformly. -->
<field name="depends_on" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="dependency_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="derives_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="has_derivation" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="links_to" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="linked_from" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="child_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>
<field name="parent_of" type="text_ja" indexed="true" stored="false" multiValued="true"/>

<!-- Integer counters: indexed but not stored. -->
<field name="views_total" type="int" indexed="true" stored="false"/>
<field name="views_recent" type="int" indexed="true" stored="false"/>
<field name="resources_accessed_total" type="int" indexed="true" stored="false"/>
<field name="resources_accessed_recent" type="int" indexed="true" stored="false"/>

<!-- Single-valued timestamps; indexed_ts defaults to NOW when no value is supplied. -->
<field name="metadata_created" type="date" indexed="true" stored="true" multiValued="false"/>
<field name="metadata_modified" type="date" indexed="true" stored="true" multiValued="false"/>
<field name="indexed_ts" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<!-- title_string holds the title as a plain string (rather than an analyzed
     text type), which allows sorting on it. No copyField rule in this schema
     targets title_string, so it is presumably filled in by the indexing
     client (TODO confirm). -->
<field name="title_string"        type="string" indexed="true"  stored="false"/>

<!-- Stored-only string fields: returned with results, never searched. -->
<field name="data_dict"           type="string" indexed="false" stored="true"/>
<field name="validated_data_dict" type="string" indexed="false" stored="true"/>

<!-- NOTE(review): Solr's stock example schemas declare _version_ as a long;
     confirm that a string type is accepted by the target Solr version. -->
<field name="_version_"           type="string" indexed="true"  stored="true"/>
<!-- Pattern-based fields for names that are not declared explicitly above. -->
<!-- Names ending in "_date": single-valued stored dates. -->
<dynamicField name="*_date"   type="date"    indexed="true" stored="true"  multiValued="false"/>
<!-- Names starting with "extras_": single-valued stored Japanese text. -->
<dynamicField name="extras_*" type="text_ja" indexed="true" stored="true"  multiValued="false"/>
<!-- Names starting with "vocab_": multi-valued stored strings. -->
<dynamicField name="vocab_*"  type="string"  indexed="true" stored="true"  multiValued="true"/>
<!-- Wildcard pattern: indexed as a string, not stored. -->
<dynamicField name="*"        type="string"  indexed="true" stored="false"/>
</fields> | |
<!-- index_id is the unique key of each indexed document. -->
<uniqueKey>index_id</uniqueKey>

<!-- Queries that do not name a field are run against the catch-all "text" field. -->
<defaultSearchField>text</defaultSearchField>

<!-- Query terms are combined with AND unless an operator is given explicitly. -->
<solrQueryParser defaultOperator="AND"/>
<!-- Collect every URL-bearing field into "urls". -->
<copyField source="url"             dest="urls"/>
<copyField source="ckan_url"        dest="urls"/>
<copyField source="download_url"    dest="urls"/>
<copyField source="res_url"         dest="urls"/>

<!-- Feed the catch-all "text" field used for default searches. -->
<copyField source="extras_*"        dest="text"/>
<copyField source="vocab_*"         dest="text"/>
<!-- Solr does not chain copyField rules: this rule forwards only values that are
     submitted directly in "urls", not values that reached "urls" through the
     rules above. -->
<copyField source="urls"            dest="text"/>
<copyField source="name"            dest="text"/>
<copyField source="title"           dest="text"/>
<!-- NOTE(review): source and destination are the same field here; confirm
     that this self-copy is intentional. -->
<copyField source="text"            dest="text"/>
<copyField source="license"         dest="text"/>
<copyField source="notes"           dest="text"/>
<copyField source="tags"            dest="text"/>
<copyField source="groups"          dest="text"/>
<copyField source="res_description" dest="text"/>
<copyField source="maintainer"      dest="text"/>
<copyField source="author"          dest="text"/>
</schema> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment