출처: https://raonlyz.tistory.com/39?category=672918
작성자: 김기범, 김영일, 이건호, 최성환
실무프로젝트
import urllib
import re
from collections import OrderedDict
from re import findall, sub
from konlpy.tag import Okt
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
# Fetch the blog post and pull the text out of its main content container.
url = ("https://raonlyz.tistory.com/39?category=672918")
# A timeout keeps the script from hanging forever on an unresponsive server.
req = requests.get(url, timeout=10)
# Fail fast on an HTTP error instead of analysing the text of an error page.
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')
title = soup.select_one("body > div > section > div")
if title is None:
    # select_one returns None when nothing matches; report that explicitly
    # rather than failing with an AttributeError on the next line.
    raise ValueError(f"no element matched the content selector on {url}")
text = title.get_text()
# Every character class the cleaner removes, merged into one precompiled
# pattern: ASCII digits, the punctuation , . ? ! ; : /, the symbols
# @ # $ % ^ & * _ and lowercase ASCII letters.
_CLEAN_TEXT_STRIP = re.compile(r'[0-9,.?!;:/@#$%^&*_a-z]')


def clean_text(text):
    """Return *text* reduced to the characters kept for noun extraction.

    The text is lowercased first, so uppercase ASCII letters are removed
    together with the lowercase ones. ASCII digits and the punctuation and
    symbol characters listed in ``_CLEAN_TEXT_STRIP`` are deleted, and every
    run of whitespace (including newlines) collapses to a single space with
    leading and trailing whitespace dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; empty if nothing survives the filtering.
    """
    lowered = text.lower()
    stripped = _CLEAN_TEXT_STRIP.sub('', lowered)
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so joining with ' ' normalises all spacing.
    return ' '.join(stripped.split())
# Clean the scraped text exactly once. The previous comprehension looped over
# the individual characters of `text` and re-cleaned the whole document for
# each of them, producing one identical copy per character.
text_result = [clean_text(text)]
print('>> 텍스트 전처리 결과<<')
print(text_result)
print(type(text_result))
# With a single cleaned document there is nothing left to de-duplicate.
a = text_result[0]
print("a",a)
result = a

# Extract nouns with the Okt tagger and count how often each one occurs.
okt = Okt()
noun = okt.nouns(result)
count = Counter(noun)
print(noun)

# Print the 100 most frequent nouns as (noun, count) pairs.
noun_list = count.most_common(100)
for v in noun_list:
    print(v)
# Load the positive-word lexicon, one entry per line.
# "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading byte
# order mark, which would otherwise stay attached to the first word and stop
# it from ever matching a noun.
with open("C:/Users/Foryoucom/Desktop/긍정단어.txt","r",encoding="utf-8-sig") as f:
    example = f.readlines()
print(example)
lst1 = list(example)
# Strip the trailing newline and any surrounding whitespace from each entry.
lsts_result1 = [line.strip() for line in lst1]
print(lsts_result1)
# Load the negative-word lexicon, one entry per line.
# "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading byte
# order mark, which would otherwise stay attached to the first word and stop
# it from ever matching a noun.
with open("C:/Users/Foryoucom/Desktop/부정단어.txt","r",encoding="utf-8-sig") as f:
    example2 = f.readlines()
lst2 = list(example2)
# Strip the trailing newline and any surrounding whitespace from each entry.
lsts_result2 = [line.strip() for line in lst2]
print(lsts_result2)
# Collect every extracted noun that appears in the positive lexicon, keeping
# one entry per occurrence in the text. A set gives constant-time membership
# tests and stops a word listed twice in the lexicon from being counted twice.
positive_words = set(lsts_result1)
pos = [word for word in noun if word in positive_words]
cnt1 = len(pos)
print(pos)
print(cnt1)

count_pos = Counter(pos)
print(count_pos)

# Distinct matched words in first-seen order; these are the bar labels.
pos_1 = dict.fromkeys(pos)
pos_2 = list(pos_1)

# Read each count straight from the Counter, in the same order as pos_2.
# Scraping single digits out of str(count_pos) split counts of 10 or more
# into separate numbers and returned them in most-common order, so the values
# did not line up with the labels.
numbers1 = [count_pos[word] for word in pos_2]
print(numbers1)
# Collect every extracted noun that appears in the negative lexicon, keeping
# one entry per occurrence in the text. A set gives constant-time membership
# tests and stops a word listed twice in the lexicon from being counted twice.
negative_words = set(lsts_result2)
neg = [word for word in noun if word in negative_words]
cnt2 = len(neg)
print(neg)
print(cnt2)

count_neg = Counter(neg)
print(count_neg)

# Distinct matched words in first-seen order; these are the bar labels.
neg_1 = dict.fromkeys(neg)
neg_2 = list(neg_1)

# Read each count straight from the Counter, in the same order as neg_2.
# Scraping single digits out of str(count_neg) split counts of 10 or more
# into separate numbers and returned them in most-common order, so the values
# did not line up with the labels.
numbers2 = [count_neg[word] for word in neg_2]
print(numbers2)
# Enable Korean text in charts: register the family name of the font file
# below as matplotlib's default font.
from matplotlib import font_manager, rc

malgun_props = font_manager.FontProperties(fname='C:/Windows/Fonts/malgun.ttf')
font_name = malgun_props.get_name()
rc('font', family=font_name)


def _draw_bar_chart(labels, heights, ticks, heading):
    """Draw one bar chart with the given title and x ticks, then display it."""
    plt.bar(labels, heights)
    plt.title(heading, fontsize=20)
    plt.xticks(ticks)
    plt.show()


# Positive-word frequencies first, then negative-word frequencies.
_draw_bar_chart(pos_2, numbers1, pos, '긍정막대 그래프')
_draw_bar_chart(neg_2, numbers2, neg, '부정막대 그래프')
# Word cloud of the matched positive words, sized by frequency.
# generate_from_frequencies raises ValueError when it receives no words, so
# the cloud is skipped (with a notice) when nothing matched the lexicon.
wc = WordCloud(font_path='malgun', width=400, height=400, scale=2.0, max_font_size=250)
gen = None
if count_pos:
    gen = wc.generate_from_frequencies(count_pos)
    plt.figure()
    plt.imshow(gen)
    plt.show()
else:
    print('긍정 단어가 없어 워드클라우드를 생략합니다.')

# Word cloud of the matched negative words, with the same empty-input guard.
wc = WordCloud(font_path='malgun', width=400, height=400, scale=2.0, max_font_size=250)
gen1 = None
if count_neg:
    gen1 = wc.generate_from_frequencies(count_neg)
    plt.figure()
    plt.imshow(gen1)
    plt.show()
else:
    print('부정 단어가 없어 워드클라우드를 생략합니다.')
'포트폴리오' 카테고리의 다른 글
python - numpy패키지 기능(R함수 비교) (0) | 2022.06.28 |
---|---|
python - 파이차트(singer.csv이용) (0) | 2022.06.28 |
python - tensorflow(iris 데이터셋 이용) (0) | 2022.06.28 |
html 2022-06-28 (0) | 2022.06.28 |
R, python 누적 막대 차트 (VADeaths 데이터 이용) (0) | 2022.06.28 |