# Scrape professor ratings from RateMyProfessors: first collect profile links
# from the paginated search results, then pull each profile's individual ratings.
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
# Seed file of GMU professor profile links, used for the first exploration below
df_rate = pd.read_excel('gmu_rate.xlsx')
df_rate.head()
# Pull the profile links into a plain list
links = []
for i in range(0, len(df_rate)):
    site = df_rate['Profile Link'].iloc[i]
    links.append(site)
len(links)
# Overall-quality labels as they appear in the scraped text: an emoji followed
# by the word. The emoji characters were corrupted by an encoding error in this
# copy of the script, so the ones below are placeholders; substitute the exact
# characters the site actually renders.
rating = {'😎awesome': '😎', '😖awful': '😖', '😐average': '😐'}
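# The dict maps each full label (emoji + word) to its emoji so the bare word can
# be recovered later with str.split. A quick sanity check (it holds for any
# emoji characters substituted above, since each key is its value plus a word):
for k, v in rating.items():
    assert k.split(v)[-1] in ('awesome', 'awful', 'average')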
# Spoof a desktop browser so the site serves the full HTML
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 'content-type': 'application/xhtml+xml'}
# Search-results URL; the offset= value at the end selects the page of results
page = 'https://www.ratemyprofessors.com/search.jsp?query=a&queryoption=HEADER&stateselect=&country=united+states&dept=&queryBy=teacherName&facetSearch=true&schoolName=&offset='
# Build one search-results URL per page: 884 pages of 20 results each.
# Note that y is incremented before it is used, so the offsets run 20..17680
# and the very first page (offset=0) is skipped.
y = 0
links = []    # overwrites the list read from gmu_rate.xlsx above
for i in range(0, 884):
    x = '&max=20'
    y = y + 20
    y1 = str(y) + x
    page1 = page + str(y1)
    links.append(page1)
len(links)
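# A more compact equivalent of the loop above (a sketch; it reproduces the
# same offsets, 20 through 17680 in steps of 20):
links_alt = [page + str(offset) + '&max=20' for offset in range(20, 884 * 20 + 1, 20)]
assert links_alt == links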
# Example of one generated URL (kept for reference; not used below)
site = 'https://www.ratemyprofessors.com/search.jsp?query=a&queryoption=HEADER&stateselect=&country=united+states&dept=&queryBy=teacherName&facetSearch=true&schoolName=&offset=20&max=20'
# Visit every search page and collect the href of each professor listing
website = []
count = len(links)
for link in links:
    count = count - 1
    print(count)          # crude progress countdown
    time.sleep(1)         # throttle requests to avoid being blocked
    source = requests.get(link, headers=headers)
    soup = BeautifulSoup(source.text, 'lxml')
    content = soup.find(class_="listings")
    try:
        for job in content.find_all('li'):
            for a in job.find_all('a', href=True):
                website.append(a['href'])
    except AttributeError:
        # soup.find() returned None: the page had no listings block
        print(link)
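# Optional defensive step: drop duplicate hrefs while preserving order, in case
# the same professor appears on more than one result page
website = list(dict.fromkeys(website))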
len(website)
# Save the collected profile hrefs to disk
df = pd.DataFrame({'col': website})
df.to_excel('_RATE_PROF_ID_A.xlsx', index=False)
# Sanity checks: 17680 results / 20 per page = 884 pages,
# and the offsets the loop above generated
17680 // 20
x = 0
for i in range(0, 884):
    x = x + 20
    print(x)
df
## link to website list
# MAIN FUNCTION: visit each profile and pull every individual rating
dfcols = ['ID', 'value', 'quality', 'difficulty', 'grade', 'comment', 'link']
df2 = pd.DataFrame(columns=dfcols)
count = len(website)
# Fields extracted from each rating block
value = ''
Q = ''
D = ''
Q_ = ''
rate = ''
grade = ''
comment = ''
site_ = 'https://www.ratemyprofessors.com/'   # hrefs from the search pages are relative
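# How a search-result href maps to a full profile URL and an ID in the loop
# below (the href shape here is a hypothetical example; the ID is assumed to be
# the trailing tid= query parameter):
example_href = 'ShowRatings.jsp?tid=123456'   # hypothetical href
print(site_ + example_href)                   # full profile link
print(example_href.split('=')[-1])            # -> '123456'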
for site in website:
    # Checkpoint progress so far; the crawl takes hours, so a crash loses
    # at most one profile's worth of rows
    df2.to_excel('test.xlsx', index=False)
    print(count)
    count = count - 1
    link = site_ + site
    time.sleep(2)         # throttle requests
    source = requests.get(link, headers=headers)
    soup = BeautifulSoup(source.text, 'lxml')
    content = soup.find(id="ratingsList")
    ID = site.split('=')[-1]   # professor id is the tid= query parameter
    profile = site
    try:
        for job in content.find_all('li'):   # one <li> per individual rating
            value = ''
            Q = ''
            D = ''
            Q_ = ''
            rate = ''
            grade = ''
            comment = ''
            for value1 in job.find_all('div'):
                try:
                    value = value1.text.strip()
                    # ------ GRADE ---------
                    if 'Grade:' in value:
                        if len(value) <= 15:   # longer strings are whole blocks, not the grade field
                            grade = value
                        else:
                            grade = ''
                    # ------ QUALITY AND DIFFICULTY ---------
                    if 'Quality' in value:
                        if 'Difficulty' in value:
                            if len(value) <= 23:
                                Q_ = value.split('Quality')[-1]
                                Q = Q_.split('Difficulty')[0]    # quality score
                                D = Q_.split('Difficulty')[-1]   # difficulty score
                            else:
                                D = ''
                                Q = ''
                    # ------ COMMENT ---------
                    # long text with no rating emoji is the written review
                    elif not any(v in value for v in rating.values()):
                        if len(value) >= 80:
                            comment = value
                    # ------ RATING LABEL ---------
                    for k, v in rating.items():
                        if value == k:
                            rate = k.split(v)[-1]   # strip the emoji, keep 'awesome'/'average'/'awful'
                except Exception:
                    pass
            # One row per rating; DataFrame.append was removed in pandas 2.0,
            # so build the row and concatenate instead
            row = pd.DataFrame([[ID, rate, Q, D, grade, comment, profile]], columns=dfcols)
            df2 = pd.concat([df2, row], ignore_index=True)
    except AttributeError:
        # soup.find() returned None: no ratingsList on the page
        print('COULD NOT FIND: ', profile)

# Final save: the in-loop checkpoint runs before each profile is parsed,
# so the last profile's rows would otherwise never reach disk
df2.to_excel('test.xlsx', index=False)
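# After the crawl, the checkpoint file can be reloaded for inspection
# (a quick usage sketch; 'test.xlsx' is the file written inside the loop):
df_out = pd.read_excel('test.xlsx')
print(df_out.shape)
df_out.head()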