def _list_dir_entries(driver, url, wait=10):
    """Load *url* in the Selenium driver and return the names of its entries.

    An entry is any ``<span class="icon">`` whose icon is a folder or archive;
    its display name lives in the following sibling ``<span class="m-l-xs">``.
    ``wait`` seconds are slept after navigation to let the JS-rendered page
    settle (crude, but matches the original pacing — TODO: use WebDriverWait).
    """
    driver.get(url)
    time.sleep(wait)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    names = []
    for icon in soup.find_all('span', class_='icon'):
        # Keep only folder / archive rows; skip plain files.
        if (icon.find('i', class_='fas fa-file-archive')
                or icon.find('i', class_='fas fa-folder')):
            names.append(icon.find_next_sibling('span', class_='m-l-xs').text.strip())
    return names


# First directory level.
first_level_dirs = _list_dir_entries(driver, base_url)

# Download links for every .asset file found two levels deep.
all_asset_links = []
for first_dir in first_level_dirs:
    # Second directory level (same listing logic, one level down).
    second_level_dirs = _list_dir_entries(driver, f"{base_url}/{first_dir}")
    for second_dir in second_level_dirs:
        driver.get(f"{base_url}/{first_dir}/{second_dir}")
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Anchor tags carrying a `download` attribute are the asset files.
        links = soup.find_all('a', attrs={'download': True})
        asset_links = ['https://bestdori.com' + link['href'] for link in links]
        print(asset_links)
        all_asset_links.extend(asset_links)
# Download every collected .asset file into save_path.
save_path = r'C:\Users\Admin\爬虫\assert'
# exist_ok avoids the exists()/makedirs() race of the original.
os.makedirs(save_path, exist_ok=True)

for link in all_asset_links:
    # Local file name = last URL path segment.
    filename = os.path.join(save_path, link.split('/')[-1])
    # Use the response as a context manager so the connection is released
    # even when the download is skipped (the original leaked it on non-200).
    # timeout keeps a dead server from hanging the whole loop forever.
    with requests.get(link, stream=True, timeout=60) as response:
        if response.status_code == 200:
            with open(filename, 'wb') as f:
                # Stream in 1 KiB chunks to avoid loading whole files in RAM.
                for chunk in response.iter_content(1024):
                    f.write(chunk)
def extract_data(data, current_data=None, output_file=None):
    """Recursively walk parsed .asset JSON and write dialogue rows.

    Accumulates ``windowDisplayName``, ``body`` (newlines stripped) and
    ``voiceId`` into *current_data* while descending; whenever all three are
    present — and the name contains no "・" and the body is not punctuation
    only — one ``voiceId|windowDisplayName|body`` row is written to
    *output_file* and the accumulator is reset.

    :param data: any JSON fragment (dict / list / scalar).
    :param current_data: shared accumulator dict; created when ``None``.
    :param output_file: writable text file receiving the rows.
    """
    if current_data is None:
        current_data = {}

    if isinstance(data, dict):
        # Harvest the fields of interest from this mapping.
        if 'windowDisplayName' in data:
            current_data['windowDisplayName'] = data['windowDisplayName']
        if 'body' in data:
            # Newlines inside the dialogue text would break the row format.
            current_data['body'] = data['body'].replace('\n', '')
        if 'voiceId' in data:
            current_data['voiceId'] = data['voiceId']

        # A row is emitted only when every field is non-empty, ...
        complete = all(current_data.get(k)
                       for k in ('windowDisplayName', 'body', 'voiceId'))
        # ... the speaker name carries no "・" separator, ...
        name_ok = '・' not in current_data.get('windowDisplayName', "")
        # ... and the body still has word characters after stripping the rest.
        body_ok = bool(re.sub(r'[^\w]', '', current_data.get('body', "")))

        if complete and name_ok and body_ok:
            row = '|'.join((current_data['voiceId'],
                            current_data['windowDisplayName'],
                            current_data['body']))
            output_file.write(row + '\n')
            current_data.clear()  # reset the accumulator for the next row

        # Descend into every value, sharing the accumulator.
        for value in data.values():
            extract_data(value, current_data, output_file)

    elif isinstance(data, list):
        for element in data:
            extract_data(element, current_data, output_file)
# Collect every qualifying dialogue row into BangDreamSortPath.txt.
# NOTE(review): `files` and `directory` must be defined earlier in the
# script (directory listing of the downloaded assets) — confirm.
with open("BangDreamSortPath.txt", "w", encoding="utf-8") as output_file:
    for filename in files:
        if not filename.endswith(".asset"):
            continue
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            extract_data(json.load(file), output_file=output_file)
# Build a mapping from audio id (mp3 basename, extension dropped) to the
# full URL line taken from WholeMp3UrlPaths.txt.
path_mapping = {}
with open("WholeMp3UrlPaths.txt", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        try:
            basename = line.strip().rsplit("/", 1)[-1]
            path_mapping[basename.replace(".mp3", "")] = line.strip('\n')
        except Exception as e:
            print(f"Error processing line {line}: {e}")
# Walk every line of BangDreamSortPath.txt and swap the leading audio id
# for its full URL when we have one in path_mapping.
new_results = []
with open("BangDreamSortPath.txt", "r", encoding="utf-8") as f:
    for line in f:
        try:
            # Only rewrite the first "|"-separated field. The original used
            # line.replace(audio_id, ...), which also substituted any
            # occurrence of the id inside the speaker name or dialogue body.
            audio_id, sep, rest = line.partition("|")
            if sep and audio_id in path_mapping:
                line = path_mapping[audio_id] + sep + rest
            new_results.append(line)
        except Exception as e:
            print(f"Error processing line {line}: {e}")

# Persist the rewritten rows.
with open("SortPathUrl.txt", "w", encoding="utf-8") as f:
    f.writelines(new_results)
# Crude cleanup: rewrite the file keeping only lines that are URLs.
# NOTE(review): `file_path` here is whatever was last assigned above —
# confirm it points at the intended URL list and not a leftover loop variable.
with open(file_path, 'r', encoding='utf-8') as file:
    filtered_lines = [line for line in file if line.startswith('https')]

# The read handle is closed before reopening for the overwrite.
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(filtered_lines)