포트폴리오
python - 블로그로 알아보는 성향분석(웹 크롤링)
leegunho
2022. 7. 28. 01:07
출처:https://raonlyz.tistory.com/39?category=672918
작성자:김기범,김영일,이건호,최성환
실무프로젝트
import urllib
import re
from collections import OrderedDict
from re import findall, sub
from konlpy.tag import Okt
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
# Download the blog post that will be analysed.
url = ("https://raonlyz.tistory.com/39?category=672918")
# The timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops the analysis from running on an HTTP error page.
req = requests.get(url, timeout=10)
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')
# select_one() returns None when nothing matches the selector; fail with a
# clear message instead of an AttributeError on the get_text() call below.
title = soup.select_one("body > div > section > div")
if title is None:
    raise ValueError("no element matches 'body > div > section > div' in %s" % url)
text = title.get_text()
def clean_text(text):
    """Normalise *text* by deleting digits, ASCII letters and punctuation.

    The text is lower-cased first, then every ASCII digit, every ``a``-``z``
    letter and each of the characters ``,.?!;:/@#$%^&*_`` is removed.
    Finally each run of whitespace is collapsed to a single space and
    leading/trailing whitespace is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # A single character class replaces four successive sub() passes; deleting
    # individual characters is order-independent, so the result is identical.
    stripped = sub('[0-9a-z,.?!;:/@#$%^&*_]', '', text.lower())
    return ' '.join(stripped.split())
# Clean the article text once.  The previous comprehension looped over every
# character of `text` while cleaning the *whole* text on each iteration, which
# produced len(text) identical strings (quadratic work) that the OrderedDict
# de-duplication below then collapsed back to a single entry.
text_result = [clean_text(text)]
print('>> 텍스트 전처리 결과<<')
print(text_result)
print(type(text_result))
# OrderedDict.fromkeys() drops duplicate entries while keeping first-seen order.
a = ' '.join(OrderedDict.fromkeys(text_result))
print("a", a)
# `a` is already a single string, so it is used as-is for noun extraction.
result = a
okt = Okt()
noun = okt.nouns(result)
count = Counter(noun)
print(noun)
# Show the 100 most frequent nouns as (noun, count) pairs.
noun_list = count.most_common(100)
for v in noun_list:
    print(v)
def _read_lines(path):
    """Return all lines of the text file at *path*, line endings included.

    The file is decoded as "utf-8-sig": identical to UTF-8 except that a
    leading byte-order mark, if present, is discarded.  With plain UTF-8 the
    BOM would stay glued to the first line, and str.strip() does not remove it,
    so the first word of such a file could never match anything.
    """
    with open(path, "r", encoding="utf-8-sig") as f:
        return f.readlines()


# Positive word list (presumably one word per line - verify against the file).
example = _read_lines("C:/Users/Foryoucom/Desktop/긍정단어.txt")
print(example)
lst1 = list(example)
# Strip line endings and surrounding whitespace from every entry.
lsts_result1 = [s.strip() for s in lst1]
print(lsts_result1)
# Negative word list, loaded the same way.
example2 = _read_lines("C:/Users/Foryoucom/Desktop/부정단어.txt")
lst2 = list(example2)
lsts_result2 = [s.strip() for s in lst2]
print(lsts_result2)
# Collect every extracted noun that appears in the positive word list.
# A set gives O(1) membership tests and, unlike the former nested loop, does
# not count a noun twice when the word list contains a duplicated entry.
positive_words = set(lsts_result1)
pos = [x for x in noun if x in positive_words]
cnt1 = len(pos)
print(pos)
print(cnt1)
count_pos = Counter(pos)
print(count_pos)
# Unique positive words in first-seen order; these become the bar labels.
pos_1 = dict.fromkeys(pos)
pos_2 = list(pos_1)
count_pos1 = str(count_pos)
# Read each count straight from the Counter so numbers1[i] always belongs to
# pos_2[i].  Scraping single digits out of str(count_pos) split counts of 10
# or more into separate digits and followed most-common order rather than
# pos_2 order, so bar heights could be mislabelled or differ in length.
numbers1 = [count_pos[word] for word in pos_2]
print(numbers1)
# Collect every extracted noun that appears in the negative word list.
# A set gives O(1) membership tests and, unlike the former nested loop, does
# not count a noun twice when the word list contains a duplicated entry.
negative_words = set(lsts_result2)
neg = [m for m in noun if m in negative_words]
cnt2 = len(neg)
print(neg)
print(cnt2)
count_neg = Counter(neg)
print(count_neg)
# Unique negative words in first-seen order; these become the bar labels.
neg_1 = dict.fromkeys(neg)
neg_2 = list(neg_1)
count_neg1 = str(count_neg)
# Read each count straight from the Counter so numbers2[i] always belongs to
# neg_2[i].  Scraping single digits out of str(count_neg) split counts of 10
# or more into separate digits and followed most-common order rather than
# neg_2 order, so bar heights could be mislabelled or differ in length.
numbers2 = [count_neg[word] for word in neg_2]
print(numbers2)
# Enable Korean text in charts by making the font stored in malgun.ttf the
# default matplotlib font family.
from matplotlib import font_manager, rc
korean_font = font_manager.FontProperties(
    fname= 'C:/Windows/Fonts/malgun.ttf')
font_name = korean_font.get_name()
rc('font', family = font_name)


def _show_bar_chart(labels, heights, chart_title, tick_words):
    """Draw one bar chart with the given title and x ticks, then display it."""
    plt.bar(labels, heights)
    plt.title(chart_title, fontsize=20)
    plt.xticks(tick_words)
    plt.show()


# Positive words first, then negative words.
_show_bar_chart(pos_2, numbers1, '긍정막대 그래프', pos)
_show_bar_chart(neg_2, numbers2, '부정막대 그래프', neg)
def _show_word_cloud(frequencies):
    """Render a word -> count mapping as a word cloud and display it.

    Args:
        frequencies: Mapping of word to occurrence count.

    Returns:
        The object returned by ``generate_from_frequencies``, or ``None`` when
        *frequencies* is empty and nothing was drawn.
    """
    if not frequencies:
        # generate_from_frequencies() raises ValueError when given no words,
        # which would abort the script whenever no sentiment word matched.
        print("word cloud skipped: no words to draw")
        return None
    cloud = WordCloud(font_path='malgun', width=400, height=400, scale=2.0, max_font_size=250)
    generated = cloud.generate_from_frequencies(frequencies)
    plt.figure()
    plt.imshow(generated)
    plt.show()
    return generated


# Positive-word cloud first, then negative-word cloud.
gen = _show_word_cloud(count_pos)
gen1 = _show_word_cloud(count_neg)