{"id":3480,"date":"2021-09-24T13:34:07","date_gmt":"2021-09-24T04:34:07","guid":{"rendered":"https:\/\/www.skyer9.pe.kr\/wordpress\/?p=3480"},"modified":"2021-10-14T09:31:35","modified_gmt":"2021-10-14T00:31:35","slug":"scrapy","status":"publish","type":"post","link":"https:\/\/www.skyer9.pe.kr\/wordpress\/?p=3480","title":{"rendered":"\uba38\uae00\ub7ad \uac00\uc774\ub4dc the beginning"},"content":{"rendered":"<h1>\uba38\uae00\ub7ad \uac00\uc774\ub4dc the beginning<\/h1>\n<p><a href=\"https:\/\/ai-creator.tistory.com\/31\">\ucc38\uc870<\/a><\/p>\n<p><a href=\"https:\/\/l0o02.github.io\/2018\/06\/19\/python-scrapy-1\/\">\ucc38\uc870<\/a><br \/>\n<a href=\"https:\/\/www.scrapingbee.com\/blog\/scrapy-javascript\/\">\ucc38\uc870<\/a><br \/>\n<a href=\"https:\/\/docs.scrapy.org\/en\/latest\/intro\/tutorial.html\">\ucc38\uc870<\/a><\/p>\n<h2>\ub370\uc774\ud0c0 \uad6c\ud558\uae30<\/h2>\n<p><a href=\"https:\/\/www.localdata.kr\/\">https:\/\/www.localdata.kr\/<\/a><\/p>\n<h2>import to MariaDB<\/h2>\n<pre><code class=\"language-bash\">file -bi data.csv\napplication\/csv; charset=unknown-8bit\n\niconv -c -f euc-kr -t utf8 data.csv &gt; data-utf8.csv\n\nhead -20 data-utf8.csv<\/code><\/pre>\n<pre><code class=\"language-bash\">mysql -u root -p<\/code><\/pre>\n<pre><code class=\"language-sql\">create user &#039;\uc0ac\uc6a9\uc790&#039;@&#039;localhost&#039; identified by &#039;\ube44\ubc00\ubc88\ud638&#039;;\n\ncreate database DB\uc774\ub984;\n\ngrant all privileges on DB\uc774\ub984.* to &#039;\uc0ac\uc6a9\uc790&#039;@&#039;localhost&#039;;\n\nflush privileges;\n\nuse DB\uc774\ub984;\n\ncreate table tbl_import(\n    `\ubc88\ud638` varchar(100),\n    `\uac1c\ubc29\uc11c\ube44\uc2a4\uba85` varchar(100),\n    `\uac1c\ubc29\uc11c\ube44\uc2a4id` varchar(100),\n    `\uac1c\ubc29\uc790\uce58\ub2e8\uccb4\ucf54\ub4dc` varchar(100),\n    `\uad00\ub9ac\ubc88\ud638` varchar(100),\n    `\uc778\ud5c8\uac00\uc77c\uc790` varchar(100),\n    `\uc778\ud5c8\uac00\ucde8\uc18c\uc77c\uc790` varchar(100),\n    `\uc601\uc5c5\uc0c1\ud0dc\uad6c\ubd84\ucf54\ub4dc` varchar(100),\n    `\uc601\uc5c5\uc0c1\ud0dc\uba85` varchar(100),\n    `\uc0c1\uc138\uc601\uc5c5\uc0c1\ud0dc\ucf54\ub4dc` varchar(100),\n    `\uc0c1\uc138\uc601\uc5c5\uc0c1\ud0dc\uba85` varchar(100),\n    `\ud3d0\uc5c5\uc77c\uc790` varchar(100),\n    `\ud734\uc5c5\uc2dc\uc791\uc77c\uc790` varchar(100),\n    `\ud734\uc5c5\uc885\ub8cc\uc77c\uc790` varchar(100),\n    `\uc7ac\uac1c\uc5c5\uc77c\uc790` varchar(100),\n    `\uc18c\uc7ac\uc9c0\uc804\ud654` varchar(100),\n    `\uc18c\uc7ac\uc9c0\uba74\uc801` varchar(100),\n    `\uc18c\uc7ac\uc9c0\uc6b0\ud3b8\ubc88\ud638` varchar(100),\n    `\uc18c\uc7ac\uc9c0\uc804\uccb4\uc8fc\uc18c` varchar(256),\n    `\ub3c4\ub85c\uba85\uc804\uccb4\uc8fc\uc18c` varchar(256),\n    `\ub3c4\ub85c\uba85\uc6b0\ud3b8\ubc88\ud638` varchar(100),\n    `\uc0ac\uc5c5\uc7a5\uba85` varchar(100),\n    `\ucd5c\uc885\uc218\uc815\uc2dc\uc810` varchar(100),\n    `\ub370\uc774\ud130\uac31\uc2e0\uad6c\ubd84` varchar(100),\n    `\ub370\uc774\ud130\uac31\uc2e0\uc77c\uc790` varchar(100),\n    `\uc5c5\ud0dc\uad6c\ubd84\uba85` varchar(100),\n    `\uc88c\ud45c\uc815\ubcf4(x)` varchar(100),\n    `\uc88c\ud45c\uc815\ubcf4(y)` varchar(100),\n    `\uc704\uc0dd\uc5c5\ud0dc\uba85` varchar(100),\n    `\ub0a8\uc131\uc885\uc0ac\uc790\uc218` varchar(100),\n    `\uc5ec\uc131\uc885\uc0ac\uc790\uc218` varchar(100),\n    `\uc601\uc5c5\uc7a5\uc8fc\ubcc0\uad6c\ubd84\uba85` varchar(100),\n    `\ub4f1\uae09\uad6c\ubd84\uba85` varchar(100),\n    `\uae09\uc218\uc2dc\uc124\uad6c\ubd84\uba85` varchar(100),\n    `\ucd1d\uc885\uc5c5\uc6d0\uc218` varchar(100),\n    `\ubcf8\uc0ac\uc885\uc5c5\uc6d0\uc218` varchar(100),\n    `\uacf5\uc7a5\uc0ac\ubb34\uc9c1\uc885\uc5c5\uc6d0\uc218` varchar(100),\n    `\uacf5\uc7a5\ud310\ub9e4\uc9c1\uc885\uc5c5\uc6d0\uc218` varchar(100),\n    `\uacf5\uc7a5\uc0dd\uc0b0\uc9c1\uc885\uc5c5\uc6d0\uc218` varchar(100),\n    `\uac74\ubb3c\uc18c\uc720\uad6c\ubd84\uba85` varchar(100),\n    `\ubcf4\uc99d\uc561` varchar(100),\n    `\uc6d4\uc138\uc561` varchar(100),\n    `\ub2e4\uc911\uc774\uc6a9\uc5c5\uc18c\uc5ec\ubd80` varchar(100),\n    `\uc2dc\uc124\ucd1d\uaddc\ubaa8` varchar(100),\n    `\uc804\ud1b5\uc5c5\uc18c\uc9c0\uc815\ubc88\ud638` varchar(100),\n    `\uc804\ud1b5\uc5c5\uc18c\uc8fc\ub41c\uc74c\uc2dd` varchar(100),\n    `\ud648\ud398\uc774\uc9c0` varchar(256)\n);\n\nLOAD DATA INFILE &#039;\/tmp\/data-utf8.csv&#039; IGNORE\nINTO TABLE tbl_import\nFIELDS TERMINATED BY &#039;,&#039;\nENCLOSED BY &#039;&quot;&#039;\nLINES TERMINATED BY &#039;\\n&#039;\nIGNORE 1 ROWS;\n\nselect \uc0ac\uc5c5\uc7a5\uba85 from tbl_import where \uc0ac\uc5c5\uc7a5\uba85 like &#039;%\ub3fc\uc9c0\uad6d\ubc25%&#039; limit 10;\n\nselect \uc778\ud5c8\uac00\uc77c\uc790, \uc18c\uc7ac\uc9c0\uc804\ud654, \uc18c\uc7ac\uc9c0\uc804\uccb4\uc8fc\uc18c, \ub3c4\ub85c\uba85\uc804\uccb4\uc8fc\uc18c, \uc0ac\uc5c5\uc7a5\uba85, `\uc88c\ud45c\uc815\ubcf4(x)`, `\uc88c\ud45c\uc815\ubcf4(y)`\nfrom tbl_import\nwhere \uc601\uc5c5\uc0c1\ud0dc\uad6c\ubd84\ucf54\ub4dc = &#039;01&#039;\nlimit 10;<\/code><\/pre>\n<pre><code class=\"language-sql\">create table tbl_restaurant_info(\n    id bigint(20) unsigned not null auto_increment,\n    primary key (id),\n    unique key unique_id (id),\n    restaurant_name varchar(255),\n    start_date varchar(10),\n    tel varchar(32),\n    address varchar(255),\n    road_address varchar(255),\n    x varchar(255),\n    y varchar(255),\n    coordinate point,\n    longitude varchar(255),\n    latitude varchar(255),\n    modify_time timestamp not null default current_timestamp on update current_timestamp,\n    insert_time timestamp not null default current_timestamp\n);\n\ninsert into tbl_restaurant_info(\n    restaurant_name,\n    start_date,\n    tel,\n    address,\n    road_address,\n    x,\n    y\n)\nselect \uc0ac\uc5c5\uc7a5\uba85, \uc778\ud5c8\uac00\uc77c\uc790, \uc18c\uc7ac\uc9c0\uc804\ud654, \uc18c\uc7ac\uc9c0\uc804\uccb4\uc8fc\uc18c, \ub3c4\ub85c\uba85\uc804\uccb4\uc8fc\uc18c, `\uc88c\ud45c\uc815\ubcf4(x)`, `\uc88c\ud45c\uc815\ubcf4(y)`\nfrom tbl_import\nwhere \uc601\uc5c5\uc0c1\ud0dc\uad6c\ubd84\ucf54\ub4dc = &#039;01&#039;;\n\nselect id, restaurant_name, start_date, x, y, modify_time, insert_time\nfrom tbl_restaurant_info\nlimit 10;<\/code><\/pre>\n<h2>\uc804\ud654\ubc88\ud638 \uad50\uc815<\/h2>\n<pre><code class=\"language-sql\">update tbl_restaurant_info\nset tel = replace(tel, &#039; &#039;, &#039;-&#039;)\nwhere tel &lt;&gt; replace(tel, &#039; &#039;, &#039;-&#039;);\n\n-- \uc2e4\uc11c\ube44\uc2a4\uc5d0\uc11c \uc815\uaddc\uc2dd \uc0ac\uc6a9\ud558\uba74 \ucd1d\uc0b4\ub2f9\ud569\ub2c8\ub2e4.\nupdate tbl_restaurant_info\nset tel = concat(substring(tel, 1, length(tel) -4), &#039;-&#039;, right(tel, 4))\nwhere tel regexp &#039;[0-9]{7,8}$&#039;;\n\n-- 5-10 \ud68c \ubc18\ubcf5\ud560 \uac83.\nupdate tbl_restaurant_info\nset tel = substring(tel, 2, 100)\nwhere tel regexp &#039;^00&#039;;\n\nupdate tbl_restaurant_info r\nset tel = concat(substring(tel, 1, 2), &#039;-&#039;, substring(tel, 3, 100))\nwhere\n    1 = 1\n    and r.tel &lt;&gt; &#039;&#039;\n    and r.tel regexp &#039;^02&#039;\n    and r.tel regexp &#039;^[0-9]{5,6}&#039;;\n\nupdate tbl_restaurant_info r\nset tel = concat(substring(tel, 1, 3), &#039;-&#039;, substring(tel, 4, 100))\nwhere\n    1 = 1\n    and r.tel &lt;&gt; &#039;&#039;\n    and not r.tel regexp &#039;^02&#039;\n    and r.tel regexp &#039;^[0-9]{6,7}&#039;;\n\n-- 2-3 \ud68c \ubc18\ubcf5\ud560 \uac83.\nupdate tbl_restaurant_info r\nset tel = replace(tel, &#039;-00&#039;, &#039;-&#039;)\nwhere\n    1 = 1\n    and r.tel &lt;&gt; &#039;&#039;;\n\nupdate tbl_restaurant_info r\nset tel = replace(tel, &#039;-0&#039;, &#039;-&#039;)\nwhere\n    1 = 1\n    and r.tel &lt;&gt; &#039;&#039;;\n\nupdate tbl_restaurant_info r\nset tel = replace(tel, &#039;--&#039;, &#039;-&#039;)\nwhere\n    1 = 1\n    and r.tel &lt;&gt; &#039;&#039;;<\/code><\/pre>\n<h2>\uc911\ubd80\uc6d0\uc810TM(EPSG:2097) to WGS84(EPSG:4326)<\/h2>\n<pre><code class=\"language-bash\">pip3 install PyMySQL\npip3 install pyproj<\/code><\/pre>\n<pre><code class=\"language-bash\">vi convert.py\n------------------------------\nfrom pyproj import Proj, transform\nimport pymysql\n\nimport warnings\nwarnings.filterwarnings(&#039;ignore&#039;)\n\ndef isfloat(value):\n  try:\n    float(value)\n    return True\n  except ValueError:\n    return False\n\n# Projection \uc815\uc758\n# \uc911\ubd80\uc6d0\uc810(Bessel): \uc11c\uc6b8 \ub4f1 \uc911\ubd80\uc9c0\uc5ed EPSG:2097\nproj_1 = Proj(init=&#039;epsg:2097&#039;)\n\n# WGS84 \uacbd\uc704\ub3c4: GPS\uac00 \uc0ac\uc6a9\ud558\ub294 \uc88c\ud45c\uacc4 EPSG:4326\nproj_2 = Proj(init=&#039;epsg:4326&#039;)\n\nconn = pymysql.connect(\n    user=&#039;mg&#039;,\n    passwd=&#039;********&#039;,\n    host=&#039;127.0.0.1&#039;,\n    db=&#039;mg&#039;,\n    charset=&#039;utf8&#039;\n)\n\ncursor = conn.cursor()\n\nsql_select = &quot;&quot;&quot;\nselect id, x, y from tbl_restaurant_info;\n&quot;&quot;&quot;\n\nsql_update = &quot;&quot;&quot;\nupdate tbl_restaurant_info\nset longitude = %f, latitude = %f\nwhere id = %d;\n&quot;&quot;&quot;\n\ncursor.execute(sql_select)\nresult = cursor.fetchall()\n\nfor row in result:\n    idx, x, y = row[0], row[1], row[2]\n\n    if x != &#039;&#039; and y != &#039;&#039;:\n        if isfloat(x) and isfloat(y):\n            x_, y_ = transform(proj_1, proj_2, x, y)\n            cursor.execute(sql_update % (x_, y_, int(idx)))\n            if idx % 500 == 0:\n                conn.commit()\n                print(&#039;committed : %d&#039; % idx)\n\nconn.commit()\n\nconn.close()\n------------------------------<\/code><\/pre>\n<h2>Send to ElasticSearch<\/h2>\n<pre><code class=\"language-bash\">PUT \/restaurant_info?pretty\n{\n    &quot;settings&quot; : {\n        &quot;number_of_shards&quot; : 1,\n        &quot;number_of_replicas&quot; : 0\n    },\n    &quot;mappings&quot; : {\n        &quot;properties&quot; : {\n            &quot;restaurant_name&quot; : { &quot;type&quot; : &quot;text&quot; },\n            &quot;start_date&quot; : {\n                &quot;type&quot; : &quot;date&quot;,\n                &quot;format&quot;: &quot;yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyyMMdd||epoch_millis&quot;\n            },\n            &quot;tel&quot; : { &quot;type&quot; : &quot;keyword&quot; },\n            &quot;address&quot; : { &quot;type&quot; : &quot;text&quot; },\n            &quot;road_address&quot; : { &quot;type&quot; : &quot;text&quot; },\n            &quot;coordinate&quot; : { &quot;type&quot; : &quot;geo_point&quot; },\n            &quot;longitude&quot; : { &quot;type&quot; : &quot;float&quot; },\n            &quot;latitude&quot; : { &quot;type&quot; : &quot;float&quot; },\n            &quot;modify_time&quot; : {\n                &quot;type&quot; : &quot;date&quot;,\n                &quot;format&quot;: &quot;yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyyMMdd||epoch_millis&quot;\n            },\n            &quot;insert_time&quot; : {\n                &quot;type&quot; : &quot;date&quot;,\n                &quot;format&quot;: &quot;yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||yyyyMMdd||epoch_millis&quot;\n            }\n        }\n    }\n}<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\uba38\uae00\ub7ad \uac00\uc774\ub4dc the beginning \ucc38\uc870 \ucc38\uc870 \ucc38\uc870 \ucc38\uc870 \ub370\uc774\ud0c0 \uad6c\ud558\uae30 https:\/\/www.localdata.kr\/ import to MariaDB file -bi data.csv application\/csv; charset=unknown-8bit iconv -c -f euc-kr -t utf8 data.csv &gt; data-utf8.csv head -20 data-utf8.csv mysql -u root -p create user &#039;\uc0ac\uc6a9\uc790&#039;@&#039;localhost&#039; identified by &#039;\ube44\ubc00\ubc88\ud638&#039;; create database DB\uc774\ub984; grant all privileges on DB\uc774\ub984.* to &#039;\uc0ac\uc6a9\uc790&#039;@&#039;localhost&#039;; flush privileges; use DB\uc774\ub984;\u2026 <span class=\"read-more\"><a href=\"https:\/\/www.skyer9.pe.kr\/wordpress\/?p=3480\">Read More &raquo;<\/a><\/span><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[],"class_list":["post-3480","post","type-post","status-publish","format-standard","hentry","category-uncategorized"],"_links":{"self":[{"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=\/wp\/v2\/posts\/3480","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=3480"}],"version-history":[{"count":34,"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=\/wp\/v2\/posts\/3480\/revisions"}],"predecessor-version":[{"id":3598,"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=\/wp\/v2\/posts\/3480\/revisions\/3598"}],"wp:attachment":[{"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=3480"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=3480"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.skyer9.pe.kr\/wordpress\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=3480"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}