Table of Contents

Elasticsearch 검색어에서 속성 추출 v2

여기에서 생각했던 아이디어를 좀 더 다듬어 본다.

문제점

이전 방식은 다수의 ES 서버에 동의어 사전 텍스트 파일을 배포해야 하는 것이 문제점이었다.
또한 다시 작업을 해보니 ngram 이 정상 작동하여 nori 형태소 분석 없이도 속성 추출이 가능했다.

인덱스 생성

# Delete the brand, color and category attribute indices before recreating them.
for attr in brand color category; do
  curl -XDELETE "localhost:9200/item_attribute_${attr}?pretty"
done

# Create the brand attribute index on the same node (localhost) that the
# DELETE, _bulk and _search steps use.
#   - my_ngram_analyzer: ngram tokenizer (1..20 chars) followed by lowercase.
#   - id:  indexed with the whitespace analyzer, but the query string is split
#          with my_ngram_analyzer at search time.
#   - kwd: keyword field.
curl -XPUT 'localhost:9200/item_attribute_brand?pretty' -H 'Content-Type: application/json' -d'
{
  "settings": {
    "number_of_shards": 5,
    "max_ngram_diff": 20,
    "number_of_replicas": 1,
    "analysis": {
      "tokenizer": {
        "my_ngram_tokenizer": {
          "type": "ngram",
          "min_gram": 1,
          "max_gram": 20,
          "token_chars": [
            "letter",
            "digit",
            "punctuation"
          ]
        }
      },
      "analyzer": {
        "my_ngram_analyzer": {
          "type": "custom",
          "tokenizer": "my_ngram_tokenizer",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "text",
        "analyzer": "whitespace",
        "search_analyzer": "my_ngram_analyzer"
      },
      "kwd": {
        "type": "keyword"
      }
    }
  }
}
'

# Create the color attribute index on the same node (localhost) that the
# DELETE, _bulk and _search steps use.
#   - my_ngram_analyzer: ngram tokenizer (1..20 chars) followed by lowercase.
#   - id:  indexed with the whitespace analyzer, but the query string is split
#          with my_ngram_analyzer at search time.
#   - kwd: keyword field.
curl -XPUT 'localhost:9200/item_attribute_color?pretty' -H 'Content-Type: application/json' -d'
{
  "settings": {
    "number_of_shards": 5,
    "max_ngram_diff": 20,
    "number_of_replicas": 1,
    "analysis": {
      "tokenizer": {
        "my_ngram_tokenizer": {
          "type": "ngram",
          "min_gram": 1,
          "max_gram": 20,
          "token_chars": [
            "letter",
            "digit",
            "punctuation"
          ]
        }
      },
      "analyzer": {
        "my_ngram_analyzer": {
          "type": "custom",
          "tokenizer": "my_ngram_tokenizer",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "text",
        "analyzer": "whitespace",
        "search_analyzer": "my_ngram_analyzer"
      },
      "kwd": {
        "type": "keyword"
      }
    }
  }
}
'

# Create the category attribute index on the same node (localhost) that the
# DELETE, _bulk and _search steps use.
#   - my_ngram_analyzer: ngram tokenizer (1..20 chars) followed by lowercase.
#   - id:  indexed with the whitespace analyzer, but the query string is split
#          with my_ngram_analyzer at search time.
#   - kwd: keyword field.
curl -XPUT 'localhost:9200/item_attribute_category?pretty' -H 'Content-Type: application/json' -d'
{
  "settings": {
    "number_of_shards": 5,
    "max_ngram_diff": 20,
    "number_of_replicas": 1,
    "analysis": {
      "tokenizer": {
        "my_ngram_tokenizer": {
          "type": "ngram",
          "min_gram": 1,
          "max_gram": 20,
          "token_chars": [
            "letter",
            "digit",
            "punctuation"
          ]
        }
      },
      "analyzer": {
        "my_ngram_analyzer": {
          "type": "custom",
          "tokenizer": "my_ngram_tokenizer",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "text",
        "analyzer": "whitespace",
        "search_analyzer": "my_ngram_analyzer"
      },
      "kwd": {
        "type": "keyword"
      }
    }
  }
}
'

데이터 입력

brand.json

{ "index": { "_index": "item_attribute_brand", "_id": "아이코닉" } }
{ "id": "아이코닉", "kwd": "아이코닉" }
{ "index": { "_index": "item_attribute_brand", "_id": "iconic" } }
{ "id": "iconic", "kwd": "아이코닉" }

color.json

{ "index": { "_index": "item_attribute_color", "_id": "검정" } }
{ "id": "검정", "kwd": "검정" }
{ "index": { "_index": "item_attribute_color", "_id": "black" } }
{ "id": "black", "kwd": "검정" }

category.json

{ "index": { "_index": "item_attribute_category", "_id": "우산" } }
{ "id": "우산", "kwd": "우산" }
{ "index": { "_index": "item_attribute_category", "_id": "umbrella" } }
{ "id": "umbrella", "kwd": "우산" }

# Send brand.json, color.json and category.json to the _bulk endpoint, one file per request.
for attr in brand color category; do
  curl -XPOST 'localhost:9200/_bulk?pretty' -H 'Content-Type: application/json' --data-binary "@${attr}.json"
done

검색어 analyze

# Run the whole query string against the `id` field of the brand index and
# return up to 20 hits ordered by descending score.
curl --request GET \
  --header 'Content-Type: application/json' \
  --data '
{
  "from": 0,
  "size": 20,
  "sort": {
    "_score": "desc"
  },
  "query": {
    "bool": {
      "should": {
        "match": {
          "id": "아이코닉검정우산"
        }
      }
    }
  }
}' \
  'http://localhost:9200/item_attribute_brand/_search?pretty'

# Run the whole query string against the `id` field of the color index and
# return up to 20 hits ordered by descending score.
curl --request GET \
  --header 'Content-Type: application/json' \
  --data '
{
  "from": 0,
  "size": 20,
  "sort": {
    "_score": "desc"
  },
  "query": {
    "bool": {
      "should": {
        "match": {
          "id": "아이코닉검정우산"
        }
      }
    }
  }
}' \
  'http://localhost:9200/item_attribute_color/_search?pretty'

# Run the whole query string against the `id` field of the category index and
# return up to 20 hits ordered by descending score.
curl --request GET \
  --header 'Content-Type: application/json' \
  --data '
{
  "from": 0,
  "size": 20,
  "sort": {
    "_score": "desc"
  },
  "query": {
    "bool": {
      "should": {
        "match": {
          "id": "아이코닉검정우산"
        }
      }
    }
  }
}' \
  'http://localhost:9200/item_attribute_category/_search?pretty'

결론

아무런 추가 기능 없이 ngram 만으로 검색어에서 속성 추출이 가능하다.

Post Views: 877

Elasticsearch 검색어에서 속성 추출 v2

문제점

인덱스 생성

데이터 입력

검색어 analyze

결론

답글 남기기 응답 취소