머글랭 가이드 the beginning

By | 2021년 9월 24일
Table of Content

머글랭 가이드 the beginning

참조

참조
참조
참조

데이타 구하기

https://www.localdata.kr/

import to MariaDB

file -bi data.csv
application/csv; charset=unknown-8bit

iconv -c -f euc-kr -t utf8 data.csv > data-utf8.csv

head -20 data-utf8.csv
mysql -u root -p
create user '사용자'@'localhost' identified by '비밀번호';

create database DB이름;

grant all privileges on DB이름.* to '사용자'@'localhost';

flush privileges;

use DB이름;

create table tbl_import(
    `번호` varchar(100),
    `개방서비스명` varchar(100),
    `개방서비스id` varchar(100),
    `개방자치단체코드` varchar(100),
    `관리번호` varchar(100),
    `인허가일자` varchar(100),
    `인허가취소일자` varchar(100),
    `영업상태구분코드` varchar(100),
    `영업상태명` varchar(100),
    `상세영업상태코드` varchar(100),
    `상세영업상태명` varchar(100),
    `폐업일자` varchar(100),
    `휴업시작일자` varchar(100),
    `휴업종료일자` varchar(100),
    `재개업일자` varchar(100),
    `소재지전화` varchar(100),
    `소재지면적` varchar(100),
    `소재지우편번호` varchar(100),
    `소재지전체주소` varchar(256),
    `도로명전체주소` varchar(256),
    `도로명우편번호` varchar(100),
    `사업장명` varchar(100),
    `최종수정시점` varchar(100),
    `데이터갱신구분` varchar(100),
    `데이터갱신일자` varchar(100),
    `업태구분명` varchar(100),
    `좌표정보(x)` varchar(100),
    `좌표정보(y)` varchar(100),
    `위생업태명` varchar(100),
    `남성종사자수` varchar(100),
    `여성종사자수` varchar(100),
    `영업장주변구분명` varchar(100),
    `등급구분명` varchar(100),
    `급수시설구분명` varchar(100),
    `총종업원수` varchar(100),
    `본사종업원수` varchar(100),
    `공장사무직종업원수` varchar(100),
    `공장판매직종업원수` varchar(100),
    `공장생산직종업원수` varchar(100),
    `건물소유구분명` varchar(100),
    `보증액` varchar(100),
    `월세액` varchar(100),
    `다중이용업소여부` varchar(100),
    `시설총규모` varchar(100),
    `전통업소지정번호` varchar(100),
    `전통업소주된음식` varchar(100),
    `홈페이지` varchar(256)
);

LOAD DATA INFILE '/tmp/data-utf8.csv' IGNORE
INTO TABLE tbl_import
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;

select 사업장명 from tbl_import where 사업장명 like '%돼지국밥%' limit 10;

select 인허가일자, 소재지전화, 소재지전체주소, 도로명전체주소, 사업장명, `좌표정보(x)`, `좌표정보(y)`
from tbl_import
where 영업상태구분코드 = '01'
limit 10;
create table tbl_restaurant_info(
    id bigint(20) unsigned not null auto_increment,
    primary key (id),
    unique key unique_id (id),
    restaurant_name varchar(255),
    start_date varchar(10),
    tel varchar(32),
    address varchar(255),
    road_address varchar(255),
    x varchar(255),
    y varchar(255),
    coordinate point,
    longitude varchar(255),
    latitude varchar(255),
    modify_time timestamp not null default current_timestamp on update current_timestamp,
    insert_time timestamp not null default current_timestamp
);

insert into tbl_restaurant_info(
    restaurant_name,
    start_date,
    tel,
    address,
    road_address,
    x,
    y
)
select 사업장명, 인허가일자, 소재지전화, 소재지전체주소, 도로명전체주소, `좌표정보(x)`, `좌표정보(y)`
from tbl_import
where 영업상태구분코드 = '01';

select id, restaurant_name, start_date, x, y, modify_time, insert_time
from tbl_restaurant_info
limit 10;

전화번호 교정

update tbl_restaurant_info
set tel = replace(tel, ' ', '-')
where tel <> replace(tel, ' ', '-');

-- 실서비스에서 정규식 사용하면 총살당합니다.
update tbl_restaurant_info
set tel = concat(substring(tel, 1, length(tel) -4), '-', right(tel, 4))
where tel regexp '[0-9]{7,8}$';

-- 5-10 회 반복할 것.
update tbl_restaurant_info
set tel = substring(tel, 2, 100)
where tel regexp '^00';

update tbl_restaurant_info r
set tel = concat(substring(tel, 1, 2), '-', substring(tel, 3, 100))
where
    1 = 1
    and r.tel <> ''
    and r.tel regexp '^02'
    and r.tel regexp '^[0-9]{5,6}';

update tbl_restaurant_info r
set tel = concat(substring(tel, 1, 3), '-', substring(tel, 4, 100))
where
    1 = 1
    and r.tel <> ''
    and not r.tel regexp '^02'
    and r.tel regexp '^[0-9]{6,7}';

-- 2-3 회 반복할 것.
update tbl_restaurant_info r
set tel = replace(tel, '-00', '-')
where
    1 = 1
    and r.tel <> '';

update tbl_restaurant_info r
set tel = replace(tel, '-0', '-')
where
    1 = 1
    and r.tel <> '';

update tbl_restaurant_info r
set tel = replace(tel, '--', '-')
where
    1 = 1
    and r.tel <> '';

중부원점TM(EPSG:2097) to WGS84(EPSG:4326)

pip3 install PyMySQL
pip3 install pyproj
vi convert.py
------------------------------
from pyproj import Proj, transform
import pymysql

import warnings
warnings.filterwarnings('ignore')

def isfloat(value):
  try:
    float(value)
    return True
  except ValueError:
    return False

# Projection 정의
# 중부원점(Bessel): 서울 등 중부지역 EPSG:2097
proj_1 = Proj(init='epsg:2097')

# WGS84 경위도: GPS가 사용하는 좌표계 EPSG:4326
proj_2 = Proj(init='epsg:4326')

conn = pymysql.connect(
    user='mg',
    passwd='********',
    host='127.0.0.1',
    db='mg',
    charset='utf8'
)

cursor = conn.cursor()

sql_select = """
select id, x, y from tbl_restaurant_info;
"""

sql_update = """
update tbl_restaurant_info
set longitude = %f, latitude = %f
where id = %d;
"""

cursor.execute(sql_select)
result = cursor.fetchall()

for row in result:
    idx, x, y = row[0], row[1], row[2]

    if x != '' and y != '':
        if isfloat(x) and isfloat(y):
            x_, y_ = transform(proj_1, proj_2, x, y)
            cursor.execute(sql_update % (x_, y_, int(idx)))
            if idx % 500 == 0:
                conn.commit()
                print('committed : %d' % idx)

conn.commit()

conn.close()
------------------------------

Send to ElasticSearch

PUT /restaurant_info?pretty
{
    "settings" : {
        "number_of_shards" : 1,
        "number_of_replicas" : 0
    },
    "mappings" : {
        "properties" : {
            "restaurant_name" : { "type" : "text" },
            "start_date" : {
                "type" : "date",
                "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyyMMdd||epoch_millis"
            },
            "tel" : { "type" : "keyword" },
            "address" : { "type" : "text" },
            "road_address" : { "type" : "text" },
            "coordinate" : { "type" : "geo_point" },
            "longitude" : { "type" : "float" },
            "latitude" : { "type" : "float" },
            "modify_time" : {
                "type" : "date",
                "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyyMMdd||epoch_millis"
            },
            "insert_time" : {
                "type" : "date",
                "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyyMMdd||epoch_millis"
            }
        }
    }
}

답글 남기기