Writing a Python script to fetch the data of all your published articles on CSDN

2024-07-12

Hello everyone, today I want to share a practical Python script that can help you batch-fetch the relevant data of all your published articles on the CSDN blog and save that data into an Excel file. In addition, the script obtains a quality score for each article and records this score in Excel as well. Let's get started!

Script functionality overview

This script is mainly divided into two parts:

Get article information and save to Excel: This part fetches your article list from the CSDN API and saves the key information into an Excel file.
Get article quality score and update Excel: This part will request a quality score for each article and add the score to the corresponding Excel file.

Implementation steps

1. Import necessary libraries

First, we need to import some Python libraries to help us with this task:


import json
import pandas as pd
from openpyxl import Workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import math
import requests

2. Define the class that saves article information to Excel

We define a class, GetInformationToExcel, to handle the acquisition of article information and the saving of Excel files:


class GetInformationToExcel:
    """Fetch one page of a CSDN user's blog article list and save it to Excel.

    Attributes:
        username: CSDN user id sent as the ``username`` query parameter.
        cookies: Raw ``Cookie`` header value sent with the request.
        Referer: ``Referer`` header value sent with the request.
        page: Page number to request.
        size: Number of articles requested per page.
        filename: Path of the workbook written by ``export_to_excel``.
    """

    # API fields kept from each article, in worksheet column order.
    _FIELDS = ['title', 'url', 'postTime', 'viewCount', 'collectCount', 'diggCount', 'commentCount']
    # Worksheet headers, positionally matching _FIELDS
    # (title, URL, post time, views, favorites, likes, comments).
    _HEADERS = ['文章标题', 'URL', '发布时间', '阅读量', '收藏量', '点赞量', '评论量']
    # Extra width added on top of the longest cell text in a column.
    _WIDTH_PADDING = 5
    # Seconds to wait for the server so a stalled connection cannot hang the script.
    _TIMEOUT = 30

    def __init__(self, username, cookies, Referer, page, size, filename):
        self.username = username
        self.cookies = cookies
        self.Referer = Referer
        self.size = size
        self.filename = filename
        self.page = page

    def _build_params(self):
        """Return the query parameters for the article-list request.

        Values are passed as plain scalars; wrapping each one in a set
        literal (``{value}``) only worked by accident of how the HTTP
        library iterates parameter values.
        """
        return {
            "page": self.page,
            "size": self.size,
            "businessType": "blog",
            "username": self.username,
        }

    @staticmethod
    def _extract_articles(payload):
        """Return the article list found at ``payload['data']['list']``.

        Returns an empty list when the payload is not a dict, or when
        ``data`` / ``list`` is missing or null.
        """
        data = payload.get('data') if isinstance(payload, dict) else None
        articles = data.get('list') if isinstance(data, dict) else None
        return articles or []

    @classmethod
    def _to_frame(cls, articles):
        """Build a DataFrame holding only the exported columns, with their headers.

        An empty article list yields an empty frame that still carries the
        headers, instead of failing on the column selection.
        """
        if not articles:
            return pd.DataFrame(columns=cls._HEADERS)
        frame = pd.DataFrame(articles)[cls._FIELDS]
        frame.columns = cls._HEADERS
        return frame

    @classmethod
    def _column_width(cls, values):
        """Return the width for a column: longest cell text plus padding.

        Every value is measured through ``str()`` so numeric cells count
        too; ``None`` cells are ignored.
        """
        longest = max((len(str(value)) for value in values if value is not None), default=0)
        return longest + cls._WIDTH_PADDING

    def get_articles(self):
        """Send an HTTP GET request to the CSDN API and return the article list.

        Returns:
            The list of article dicts from the response, or an empty list
            when the request fails, the body is not valid JSON, or the
            response contains no list.
        """
        url = "https://blog.csdn.net/community/home-api/v1/get-business-list"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Cookie': self.cookies,
            'Referer': self.Referer
        }

        try:
            response = requests.get(url, params=self._build_params(), headers=headers, timeout=self._TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"HTTP错误: {e.response.status_code} {e.response.reason}")
            return []
        except requests.exceptions.RequestException as e:
            print(f"请求异常: {e}")
            return []

        # Decode separately: recent versions of the HTTP library raise a JSON
        # error that is also a RequestException, so a shared try block would
        # report it as a request failure. ValueError covers every variant.
        try:
            payload = response.json()
        except ValueError:
            print("解析JSON失败")
            return []
        return self._extract_articles(payload)

    def export_to_excel(self):
        """Fetch the article list and write the selected columns to ``self.filename``.

        The worksheet gets a header row followed by one row per article, and
        each column is widened to fit its longest cell.
        """
        df = self._to_frame(self.get_articles())
        wb = Workbook()
        sheet = wb.active
        for row in dataframe_to_rows(df, index=False, header=True):
            sheet.append(row)
        for column_cells in sheet.columns:
            column_cells = list(column_cells)
            letter = column_cells[0].column_letter
            width = self._column_width([cell.value for cell in column_cells])
            sheet.column_dimensions[letter].width = width
        # Save the workbook
        wb.save(self.filename)

In this class, we implemented the following methods:

__init__: Initialization method, setting the basic properties of the class.
get_articles: Send an HTTP GET request to the CSDN API to get a list of articles.
export_to_excel: Convert the article list to a Pandas DataFrame and save it to an Excel file.

3. Define the class for obtaining article quality scores

Next, we define another class, GetArticleScores, to handle the acquisition of article quality scores and the update of Excel files:


class GetArticleScores:
    """Look up the CSDN quality score of every article listed in an Excel file.

    Attributes:
        filepath: Path of the workbook. It must contain a ``URL`` column and
            is rewritten in place by ``write_scores_to_excel``.
    """

    # Value returned when the response carries no usable score.
    _SCORE_MISSING = 'Score not found'
    # Value returned when the request or JSON decoding fails.
    _SCORE_ERROR = "Error fetching score"
    # Seconds to wait for the server so a stalled connection cannot hang the script.
    _TIMEOUT = 30

    def __init__(self, filepath):
        self.filepath = filepath

    @staticmethod
    def _extract_score(payload):
        """Return ``payload['data']['score']`` or the "not found" placeholder.

        Handles a payload that is not a dict and a ``data`` field that is
        missing or null, both of which would otherwise raise.
        """
        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            return GetArticleScores._SCORE_MISSING
        return data.get('score', GetArticleScores._SCORE_MISSING)

    @staticmethod
    def get_article_score(article_url):
        """Send an HTTP POST request to the score API for one article URL.

        Args:
            article_url: URL of the article to score.

        Returns:
            The ``score`` value from the response, ``'Score not found'`` when
            the response has none, or ``'Error fetching score'`` when the
            request fails or the body is not valid JSON.
        """
        url = "https://bizapi.csdn.net/trends/api/v1/get-article-score"
        # NOTE(review): the X-Ca-* values are fixed literals, presumably
        # captured from a browser session - confirm the API still accepts them.
        headers = {
            "Accept": "application/json, text/plain, */*",
            "X-Ca-Key": "203930474",
            "X-Ca-Nonce": "b35e1821-05c2-458d-adae-3b720bb15fdf",
            "X-Ca-Signature": "gjeSiKTRCh8aDv0UwThIVRITc/JtGJkgkZoLVeA6sWo=",
            "X-Ca-Signature-Headers": "x-ca-key,x-ca-nonce",
            "X-Ca-Signed-Content-Type": "multipart/form-data",
        }
        data = {"url": article_url}
        try:
            response = requests.post(url, headers=headers, data=data, timeout=GetArticleScores._TIMEOUT)
            response.raise_for_status()  # This will raise an error for bad responses
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors on library versions where
            # they are not a RequestException.
            print(f"Request failed: {e}")
            return GetArticleScores._SCORE_ERROR
        return GetArticleScores._extract_score(payload)

    def _scores_for(self, urls):
        """Return the score of each URL, in the same order as ``urls``."""
        return [self.get_article_score(url) for url in urls]

    def get_scores_from_excel(self):
        """Read the Excel file and return the score of every URL in its ``URL`` column."""
        df = pd.read_excel(self.filepath)
        return self._scores_for(df['URL'].tolist())

    def write_scores_to_excel(self):
        """Add the scores as a new column and save the workbook in place.

        The workbook is read once; the scores are computed from that same
        frame so the new column always lines up with the rows written back.
        """
        df = pd.read_excel(self.filepath)
        df['质量分'] = self._scores_for(df['URL'].tolist())
        df.to_excel(self.filepath, index=False)

In this class, we implemented the following methods:

__init__: Initialization method, setting the basic properties of the class.
get_article_score: Static method that sends an HTTP POST request to an API to obtain the quality score of the article.
get_scores_from_excel: Read the Excel file, get the article URL list, and get the score list.
write_scores_to_excel: Read the Excel file into a DataFrame, add the obtained scores to the DataFrame, and save it back to the Excel file.

4. Main program

Finally, we set the total number of articles, cookies, Referer, and CSDN user ID in the main program and performed the following steps:

Calculate the number of pages that need to be requested.
Loop through each page of articles, create an Excel file, and get the quality score and write it into Excel.


if __name__ == '__main__':
    # Fill in: total number of published articles, your cookies, the Referer
    # of your blog home page, and your CSDN id.
    total = 145
    cookies = 'uuid_tt_dd=10'  # Simplified for brevity
    Referer = 'https://blog.csdn.net/q244645787'
    CSDNid = 'q244645787'

    # Articles are requested 100 per page; round up so a partial last page
    # is still fetched.
    page_size = 100
    page_count = math.ceil(total / page_size)
    for page in range(1, page_count + 1):
        filename = f"score{page}.xlsx"
        # Export this page's article list, then add each article's quality score.
        GetInformationToExcel(CSDNid, cookies, Referer, page, page_size, filename).export_to_excel()
        GetArticleScores(filename).write_scores_to_excel()

    print("获取完成")

After the execution is completed, you will get one Excel file per page of up to 100 articles (score1.xlsx, score2.xlsx, …), each containing the article data and quality scores for that page.

All code:


import json
import pandas as pd
from openpyxl import Workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import math
import requests
 
 
# Fetch article information in bulk and save it to Excel
class GetInformationToExcel:
    """Fetch one page of a CSDN user's blog article list and save it to Excel.

    Attributes:
        username: CSDN user id sent as the ``username`` query parameter.
        cookies: Raw ``Cookie`` header value sent with the request.
        Referer: ``Referer`` header value sent with the request.
        page: Page number to request.
        size: Number of articles requested per page.
        filename: Path of the workbook written by ``export_to_excel``.
    """

    # API fields kept from each article, in worksheet column order.
    _FIELDS = ['title', 'url', 'postTime', 'viewCount', 'collectCount', 'diggCount', 'commentCount']
    # Worksheet headers, positionally matching _FIELDS
    # (title, URL, post time, views, favorites, likes, comments).
    _HEADERS = ['文章标题', 'URL', '发布时间', '阅读量', '收藏量', '点赞量', '评论量']
    # Extra width added on top of the longest cell text in a column.
    _WIDTH_PADDING = 5
    # Seconds to wait for the server so a stalled connection cannot hang the script.
    _TIMEOUT = 30

    def __init__(self, username, cookies, Referer, page, size, filename):
        self.username = username
        self.cookies = cookies
        self.Referer = Referer
        self.size = size
        self.filename = filename
        self.page = page

    def _build_params(self):
        """Return the query parameters for the article-list request.

        Values are passed as plain scalars; wrapping each one in a set
        literal (``{value}``) only worked by accident of how the HTTP
        library iterates parameter values.
        """
        return {
            "page": self.page,
            "size": self.size,
            "businessType": "blog",
            "username": self.username,
        }

    @staticmethod
    def _extract_articles(payload):
        """Return the article list found at ``payload['data']['list']``.

        Returns an empty list when the payload is not a dict, or when
        ``data`` / ``list`` is missing or null.
        """
        data = payload.get('data') if isinstance(payload, dict) else None
        articles = data.get('list') if isinstance(data, dict) else None
        return articles or []

    @classmethod
    def _to_frame(cls, articles):
        """Build a DataFrame holding only the exported columns, with their headers.

        An empty article list yields an empty frame that still carries the
        headers, instead of failing on the column selection.
        """
        if not articles:
            return pd.DataFrame(columns=cls._HEADERS)
        frame = pd.DataFrame(articles)[cls._FIELDS]
        frame.columns = cls._HEADERS
        return frame

    @classmethod
    def _column_width(cls, values):
        """Return the width for a column: longest cell text plus padding.

        Every value is measured through ``str()`` so numeric cells count
        too; ``None`` cells are ignored.
        """
        longest = max((len(str(value)) for value in values if value is not None), default=0)
        return longest + cls._WIDTH_PADDING

    def get_articles(self):
        """Send an HTTP GET request to the CSDN API and return the article list.

        Returns:
            The list of article dicts from the response, or an empty list
            when the request fails, the body is not valid JSON, or the
            response contains no list.
        """
        url = "https://blog.csdn.net/community/home-api/v1/get-business-list"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
            'Cookie': self.cookies,
            'Referer': self.Referer
        }

        try:
            response = requests.get(url, params=self._build_params(), headers=headers, timeout=self._TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"HTTP错误: {e.response.status_code} {e.response.reason}")
            return []
        except requests.exceptions.RequestException as e:
            print(f"请求异常: {e}")
            return []

        # Decode separately: recent versions of the HTTP library raise a JSON
        # error that is also a RequestException, so a shared try block would
        # report it as a request failure. ValueError covers every variant.
        try:
            payload = response.json()
        except ValueError:
            print("解析JSON失败")
            return []
        return self._extract_articles(payload)

    def export_to_excel(self):
        """Fetch the article list and write the selected columns to ``self.filename``.

        The worksheet gets a header row followed by one row per article, and
        each column is widened to fit its longest cell.
        """
        df = self._to_frame(self.get_articles())
        wb = Workbook()
        sheet = wb.active
        for row in dataframe_to_rows(df, index=False, header=True):
            sheet.append(row)
        for column_cells in sheet.columns:
            column_cells = list(column_cells)
            letter = column_cells[0].column_letter
            width = self._column_width([cell.value for cell in column_cells])
            sheet.column_dimensions[letter].width = width
        # Save the workbook
        wb.save(self.filename)
 
 
# Get the quality score of each article and write the scores into the Excel file
class GetArticleScores:
    """Look up the CSDN quality score of every article listed in an Excel file.

    Attributes:
        filepath: Path of the workbook. It must contain a ``URL`` column and
            is rewritten in place by ``write_scores_to_excel``.
    """

    # Value returned when the response carries no usable score.
    _SCORE_MISSING = 'Score not found'
    # Value returned when the request or JSON decoding fails.
    _SCORE_ERROR = "Error fetching score"
    # Seconds to wait for the server so a stalled connection cannot hang the script.
    _TIMEOUT = 30

    def __init__(self, filepath):
        self.filepath = filepath

    @staticmethod
    def _extract_score(payload):
        """Return ``payload['data']['score']`` or the "not found" placeholder.

        Handles a payload that is not a dict and a ``data`` field that is
        missing or null, both of which would otherwise raise.
        """
        data = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(data, dict):
            return GetArticleScores._SCORE_MISSING
        return data.get('score', GetArticleScores._SCORE_MISSING)

    @staticmethod
    def get_article_score(article_url):
        """Send an HTTP POST request to the score API for one article URL.

        Args:
            article_url: URL of the article to score.

        Returns:
            The ``score`` value from the response, ``'Score not found'`` when
            the response has none, or ``'Error fetching score'`` when the
            request fails or the body is not valid JSON.
        """
        url = "https://bizapi.csdn.net/trends/api/v1/get-article-score"
        # NOTE(review): the X-Ca-* values are fixed literals, presumably
        # captured from a browser session - confirm the API still accepts them.
        headers = {
            "Accept": "application/json, text/plain, */*",
            "X-Ca-Key": "203930474",
            "X-Ca-Nonce": "b35e1821-05c2-458d-adae-3b720bb15fdf",
            "X-Ca-Signature": "gjeSiKTRCh8aDv0UwThIVRITc/JtGJkgkZoLVeA6sWo=",
            "X-Ca-Signature-Headers": "x-ca-key,x-ca-nonce",
            "X-Ca-Signed-Content-Type": "multipart/form-data",
        }
        data = {"url": article_url}
        try:
            response = requests.post(url, headers=headers, data=data, timeout=GetArticleScores._TIMEOUT)
            response.raise_for_status()  # This will raise an error for bad responses
            payload = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors on library versions where
            # they are not a RequestException.
            print(f"Request failed: {e}")
            return GetArticleScores._SCORE_ERROR
        return GetArticleScores._extract_score(payload)

    def _scores_for(self, urls):
        """Return the score of each URL, in the same order as ``urls``."""
        return [self.get_article_score(url) for url in urls]

    def get_scores_from_excel(self):
        """Read the Excel file and return the score of every URL in its ``URL`` column."""
        df = pd.read_excel(self.filepath)
        return self._scores_for(df['URL'].tolist())

    def write_scores_to_excel(self):
        """Add the scores as a new column and save the workbook in place.

        The workbook is read once; the scores are computed from that same
        frame so the new column always lines up with the rows written back.
        """
        df = pd.read_excel(self.filepath)
        df['质量分'] = self._scores_for(df['URL'].tolist())
        df.to_excel(self.filepath, index=False)
 
 
if __name__ == '__main__':
    # Fill in: total number of published articles, your cookies, the Referer
    # of your blog home page, and your CSDN id.
    total = 145
    cookies = 'uuid_tt_dd=10'  # Simplified for brevity
    Referer = 'https://blog.csdn.net/q244645787'
    CSDNid = 'q244645787'

    # Articles are requested 100 per page; round up so a partial last page
    # is still fetched.
    page_size = 100
    page_count = math.ceil(total / page_size)
    for page in range(1, page_count + 1):
        filename = f"score{page}.xlsx"
        # Export this page's article list, then add each article's quality score.
        GetInformationToExcel(CSDNid, cookies, Referer, page, page_size, filename).export_to_excel()
        GetArticleScores(filename).write_scores_to_excel()

    print("获取完成")

Technology Sharing