英文:
Is this scraper limited to 99 records or is there a bug?
问题
这是一个网页抓取程序。它抓取从起始日期到2023年7月1日之间的记录。如果我将起始日期设置为5月1日,它会返回预期数量的记录。如果我将其设置为1月1日,它返回的记录最早只到2月8日。从2月8日到7月1日,共有99条记录。也许存在99条记录的限制。然而,与该限制相矛盾的是,如果我将URI粘贴到Edge网页浏览器中,我可以获得成千上万条记录。程序中是否有错误?我猜没有,但我也愿意接受它可以被修复的可能性。
/// <summary>
/// Console program that downloads the Yahoo Finance history page for INTC and prints
/// the value found in the sixth cell (treated as the adjusted close) of every price row
/// in the table marked <c>data-test="historical-prices"</c>.
/// </summary>
/// <remarks>
/// Only rows that are present in the downloaded HTML are parsed. Rows that a browser
/// would add later through client-side script are not part of the response body and
/// therefore cannot appear in the result.
/// </remarks>
internal class Program
{
    /// <summary>Reference point used to convert a date into Unix seconds.</summary>
    private static readonly DateTime UnixEpoch = new DateTime(1970, 1, 1);

    /// <summary>
    /// Downloads the history page for the fixed range 2023-01-01 .. 2023-07-01,
    /// parses it and writes every (date, price) pair to the console.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    /// <exception cref="HttpRequestException">The server answered with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">The page does not have the expected structure.</exception>
    static async Task Main(string[] args)
    {
        Console.Beep(); // test
        ulong startTime = ToUnixSeconds(new DateTime(2023, 1, 1));
        ulong endTime = ToUnixSeconds(new DateTime(2023, 7, 1));
        string uri = $"http://finance.yahoo.com/quote/INTC/history?period1={startTime}&period2={endTime}&interval=1d&filter=history&frequency=1d";

        // Read the body first and release the connection before waiting on user input.
        string html;
        using (var client = new HttpClient())
        {
            client.Timeout = TimeSpan.FromMinutes(5);
            using (var response = await client.GetAsync(uri))
            {
                if (!response.IsSuccessStatusCode)
                {
                    throw new HttpRequestException(
                        $"Request to {uri} failed with status {(int)response.StatusCode} ({response.ReasonPhrase}).");
                }

                html = await response.Content.ReadAsStringAsync();
            }
        }

        Dictionary<DateTime, float> historicalPrices = await ParseHistoricalPricesAsync(html);
        Console.Beep();
        Console.WriteLine($"historical prices count {historicalPrices.Count}");
        Console.WriteLine("Press any key to continue.");
        Console.ReadKey();
        foreach (var kvp in historicalPrices)
        {
            Console.WriteLine($"{kvp.Key} {kvp.Value}");
        }
    }

    /// <summary>Converts a date to whole seconds elapsed since 1970-01-01.</summary>
    /// <param name="value">The date to convert; its <see cref="DateTime.Kind"/> is ignored by the subtraction.</param>
    /// <returns>The number of seconds between <paramref name="value"/> and the epoch.</returns>
    private static ulong ToUnixSeconds(DateTime value)
    {
        return Convert.ToUInt64((value - UnixEpoch).TotalSeconds);
    }

    /// <summary>
    /// Parses the history page and extracts one price per date from the
    /// <c>historical-prices</c> table.
    /// </summary>
    /// <param name="html">Raw HTML of the history page.</param>
    /// <returns>
    /// A dictionary keyed by the date in the first cell of each seven-cell row, holding the
    /// numeric value of the sixth cell. When a date occurs more than once the last row wins.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The title, the table, its body or a row does not match the expected layout.
    /// </exception>
    /// <exception cref="FormatException">A date or price cell cannot be parsed.</exception>
    private static async Task<Dictionary<DateTime, float>> ParseHistoricalPricesAsync(string html)
    {
        AngleSharp.IBrowsingContext browsingContext = BrowsingContext.New(AngleSharp.Configuration.Default);

        // Awaited instead of blocking on .Result so failures surface unwrapped
        // rather than inside an AggregateException.
        AngleSharp.Dom.IDocument document = await browsingContext.OpenAsync(virtualResponse => virtualResponse.Content(html));

        if (document.Title == null || !document.Title.Contains("Historical Prices"))
        {
            throw new InvalidOperationException(
                $"Unexpected page title '{document.Title}'; expected it to contain 'Historical Prices'.");
        }

        // Exactly one table must carry data-test="historical-prices".
        IElement? matchingTable = null;
        int matchCount = 0;
        foreach (var table in document.GetElementsByTagName("table"))
        {
            if (table.GetAttribute("data-test") == "historical-prices")
            {
                matchCount++;
                matchingTable = table;
            }
        }

        if (matchCount != 1 || matchingTable == null)
        {
            throw new InvalidOperationException(
                $"Expected exactly one 'historical-prices' table but found {matchCount}.");
        }

        var tbodyCollection = matchingTable.GetElementsByTagName("tbody");
        if (tbodyCollection.Length != 1)
        {
            throw new InvalidOperationException(
                $"Expected exactly one <tbody> in the price table but found {tbodyCollection.Length}.");
        }

        var prices = new Dictionary<DateTime, float>();
        foreach (var trElement in tbodyCollection[0].GetElementsByTagName("tr"))
        {
            var tdCollection = trElement.GetElementsByTagName("td");
            if (tdCollection.Length == 2)
            {
                continue; // two-cell rows are not price records and are ignored
            }

            if (tdCollection.Length != 7)
            {
                throw new InvalidOperationException(
                    $"Unexpected row with {tdCollection.Length} cells in the price table.");
            }

            string dateString = tdCollection[0].TextContent.Trim();
            string adjCloseString = tdCollection[5].TextContent.Trim().Replace(",", ""); // remove commas

            DateTime date = DateTime.ParseExact(dateString, "MMM dd, yyyy", CultureInfo.InvariantCulture);
            Debug.Assert(date.Kind == DateTimeKind.Unspecified);
            Debug.Assert(date.TimeOfDay == TimeSpan.Zero);

            // Parse with the invariant culture: the page uses '.' as the decimal separator,
            // which a machine culture such as de-DE would otherwise misread.
            float adjClose = (float)double.Parse(
                adjCloseString,
                NumberStyles.Float | NumberStyles.AllowThousands,
                CultureInfo.InvariantCulture);

            // Indexer assignment overwrites an existing entry, so the last row for a date wins.
            prices[date] = adjClose;
        }

        return prices;
    }
}
英文:
Here's a scraper. It scrapes for records dated from a start date to 2023 July 1. If I set the start date to May 1, it returns the expected quantity of records. If I set it to January 1, it gives me nothing older than February 8. From February 8 to July 1, there are 99 records. Perhaps there is a 99-record limit. However, contrary to that limit, if I paste the URI into the Edge web browser, I can get thousands of records. Is there a bug in the program? I suspect not, but I am open to the possibility that it can be fixed.
/// <summary>
/// Console program that downloads the Yahoo Finance history page for INTC and prints
/// the value found in the sixth cell (treated as the adjusted close) of every price row
/// in the table marked <c>data-test="historical-prices"</c>.
/// </summary>
/// <remarks>
/// Only rows that are present in the downloaded HTML are parsed. Rows that a browser
/// would add later through client-side script are not part of the response body and
/// therefore cannot appear in the result.
/// </remarks>
internal class Program
{
    /// <summary>Reference point used to convert a date into Unix seconds.</summary>
    private static readonly DateTime UnixEpoch = new DateTime(1970, 1, 1);

    /// <summary>
    /// Downloads the history page for the fixed range 2023-01-01 .. 2023-07-01,
    /// parses it and writes every (date, price) pair to the console.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    /// <exception cref="HttpRequestException">The server answered with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">The page does not have the expected structure.</exception>
    static async Task Main(string[] args)
    {
        Console.Beep(); // test
        ulong startTime = ToUnixSeconds(new DateTime(2023, 1, 1));
        ulong endTime = ToUnixSeconds(new DateTime(2023, 7, 1));
        string uri = $"http://finance.yahoo.com/quote/INTC/history?period1={startTime}&period2={endTime}&interval=1d&filter=history&frequency=1d";

        // Read the body first and release the connection before waiting on user input.
        string html;
        using (var client = new HttpClient())
        {
            client.Timeout = TimeSpan.FromMinutes(5);
            using (var response = await client.GetAsync(uri))
            {
                if (!response.IsSuccessStatusCode)
                {
                    throw new HttpRequestException(
                        $"Request to {uri} failed with status {(int)response.StatusCode} ({response.ReasonPhrase}).");
                }

                html = await response.Content.ReadAsStringAsync();
            }
        }

        Dictionary<DateTime, float> historicalPrices = await ParseHistoricalPricesAsync(html);
        Console.Beep();
        Console.WriteLine($"historical prices count {historicalPrices.Count}");
        Console.WriteLine("Press any key to continue.");
        Console.ReadKey();
        foreach (var kvp in historicalPrices)
        {
            Console.WriteLine($"{kvp.Key} {kvp.Value}");
        }
    }

    /// <summary>Converts a date to whole seconds elapsed since 1970-01-01.</summary>
    /// <param name="value">The date to convert; its <see cref="DateTime.Kind"/> is ignored by the subtraction.</param>
    /// <returns>The number of seconds between <paramref name="value"/> and the epoch.</returns>
    private static ulong ToUnixSeconds(DateTime value)
    {
        return Convert.ToUInt64((value - UnixEpoch).TotalSeconds);
    }

    /// <summary>
    /// Parses the history page and extracts one price per date from the
    /// <c>historical-prices</c> table.
    /// </summary>
    /// <param name="html">Raw HTML of the history page.</param>
    /// <returns>
    /// A dictionary keyed by the date in the first cell of each seven-cell row, holding the
    /// numeric value of the sixth cell. When a date occurs more than once the last row wins.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// The title, the table, its body or a row does not match the expected layout.
    /// </exception>
    /// <exception cref="FormatException">A date or price cell cannot be parsed.</exception>
    private static async Task<Dictionary<DateTime, float>> ParseHistoricalPricesAsync(string html)
    {
        AngleSharp.IBrowsingContext browsingContext = BrowsingContext.New(AngleSharp.Configuration.Default);

        // Awaited instead of blocking on .Result so failures surface unwrapped
        // rather than inside an AggregateException.
        AngleSharp.Dom.IDocument document = await browsingContext.OpenAsync(virtualResponse => virtualResponse.Content(html));

        if (document.Title == null || !document.Title.Contains("Historical Prices"))
        {
            throw new InvalidOperationException(
                $"Unexpected page title '{document.Title}'; expected it to contain 'Historical Prices'.");
        }

        // Exactly one table must carry data-test="historical-prices".
        IElement? matchingTable = null;
        int matchCount = 0;
        foreach (var table in document.GetElementsByTagName("table"))
        {
            if (table.GetAttribute("data-test") == "historical-prices")
            {
                matchCount++;
                matchingTable = table;
            }
        }

        if (matchCount != 1 || matchingTable == null)
        {
            throw new InvalidOperationException(
                $"Expected exactly one 'historical-prices' table but found {matchCount}.");
        }

        var tbodyCollection = matchingTable.GetElementsByTagName("tbody");
        if (tbodyCollection.Length != 1)
        {
            throw new InvalidOperationException(
                $"Expected exactly one <tbody> in the price table but found {tbodyCollection.Length}.");
        }

        var prices = new Dictionary<DateTime, float>();
        foreach (var trElement in tbodyCollection[0].GetElementsByTagName("tr"))
        {
            var tdCollection = trElement.GetElementsByTagName("td");
            if (tdCollection.Length == 2)
            {
                continue; // two-cell rows are not price records and are ignored
            }

            if (tdCollection.Length != 7)
            {
                throw new InvalidOperationException(
                    $"Unexpected row with {tdCollection.Length} cells in the price table.");
            }

            string dateString = tdCollection[0].TextContent.Trim();
            string adjCloseString = tdCollection[5].TextContent.Trim().Replace(",", ""); // remove commas

            DateTime date = DateTime.ParseExact(dateString, "MMM dd, yyyy", CultureInfo.InvariantCulture);
            Debug.Assert(date.Kind == DateTimeKind.Unspecified);
            Debug.Assert(date.TimeOfDay == TimeSpan.Zero);

            // Parse with the invariant culture: the page uses '.' as the decimal separator,
            // which a machine culture such as de-DE would otherwise misread.
            float adjClose = (float)double.Parse(
                adjCloseString,
                NumberStyles.Float | NumberStyles.AllowThousands,
                CultureInfo.InvariantCulture);

            // Indexer assignment overwrites an existing entry, so the last row for a date wins.
            prices[date] = adjClose;
        }

        return prices;
    }
}
答案1
得分: 2
以下是翻译好的部分:
HTML页面正在使用Javascript进行动态分页,初始加载时只呈现99行。只需使用CSV下载链接 - 它会获取完整的数据。
这对我来说有效:
/// <summary>
/// Downloads daily INTC history as CSV from the Yahoo Finance download endpoint and
/// returns one price per date.
/// </summary>
/// <param name="start">First date of the requested range.</param>
/// <param name="end">End date of the requested range.</param>
/// <returns>
/// A dictionary keyed by the date in the first CSV column, holding the value of the
/// fifth column (index 4) of each data line.
/// </returns>
/// <exception cref="HttpRequestException">The download failed.</exception>
/// <exception cref="FormatException">A date or price field cannot be parsed.</exception>
/// <exception cref="ArgumentException">The CSV contains the same date twice.</exception>
private static async Task<Dictionary<DateTime, decimal>> GetHistoricalPricesAsync(DateTime start, DateTime end)
{
    ulong GetUnixTime(DateTime value) => Convert.ToUInt64((value - new DateTime(1970, 1, 1)).TotalSeconds);

    // Honour the caller's range; previously both parameters were ignored and the
    // request was always made for hard-coded dates.
    ulong startTime = GetUnixTime(start);
    ulong endTime = GetUnixTime(end);
    string uri = $"https://query1.finance.yahoo.com/v7/finance/download/INTC?period1={startTime}&period2={endTime}&interval=1d&events=history&includeAdjustedClose=true";

    string csv;
    using (var client = new HttpClient())
    {
        csv = await client.GetStringAsync(uri);
    }

    // Split on either line-ending character and drop empty entries, so "\r\n" pairs
    // or a trailing newline do not produce blank lines that fail to parse.
    string[] lines = csv.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

    // The first line is skipped as the header. Values are machine-formatted, so they
    // are parsed with the invariant culture instead of the current one.
    // NOTE(review): index 4 is presumably the "Close" column; if the adjusted close is
    // wanted, confirm the column order against the CSV header.
    return lines
        .Skip(1)
        .Select(line => line.Split(','))
        .ToDictionary(
            fields => DateTime.Parse(fields[0], System.Globalization.CultureInfo.InvariantCulture),
            fields => decimal.Parse(fields[4], System.Globalization.CultureInfo.InvariantCulture));
}
// Request the prices between 1 Jan 2021 and 1 Feb 2021.
DateTime rangeStart = new DateTime(2021, 1, 1);
DateTime rangeEnd = new DateTime(2021, 2, 1);
Dictionary<DateTime, decimal> output = await GetHistoricalPricesAsync(rangeStart, rangeEnd);
这给了我:
对于较长的时间跨度也可以正常工作。
英文:
The HTML page is paging dynamically using Javascript and is only rendering 99 rows on the initial load. Just use the CSV download URL - it pulls the full data.
This works for me:
/// <summary>
/// Downloads daily INTC history as CSV from the Yahoo Finance download endpoint and
/// returns one price per date.
/// </summary>
/// <param name="start">First date of the requested range.</param>
/// <param name="end">End date of the requested range.</param>
/// <returns>
/// A dictionary keyed by the date in the first CSV column, holding the value of the
/// fifth column (index 4) of each data line.
/// </returns>
/// <exception cref="HttpRequestException">The download failed.</exception>
/// <exception cref="FormatException">A date or price field cannot be parsed.</exception>
/// <exception cref="ArgumentException">The CSV contains the same date twice.</exception>
private static async Task<Dictionary<DateTime, decimal>> GetHistoricalPricesAsync(DateTime start, DateTime end)
{
    ulong GetUnixTime(DateTime value) => Convert.ToUInt64((value - new DateTime(1970, 1, 1)).TotalSeconds);

    // Honour the caller's range; previously both parameters were ignored and the
    // request was always made for hard-coded dates.
    ulong startTime = GetUnixTime(start);
    ulong endTime = GetUnixTime(end);
    string uri = $"https://query1.finance.yahoo.com/v7/finance/download/INTC?period1={startTime}&period2={endTime}&interval=1d&events=history&includeAdjustedClose=true";

    string csv;
    using (var client = new HttpClient())
    {
        csv = await client.GetStringAsync(uri);
    }

    // Split on either line-ending character and drop empty entries, so "\r\n" pairs
    // or a trailing newline do not produce blank lines that fail to parse.
    string[] lines = csv.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

    // The first line is skipped as the header. Values are machine-formatted, so they
    // are parsed with the invariant culture instead of the current one.
    // NOTE(review): index 4 is presumably the "Close" column; if the adjusted close is
    // wanted, confirm the column order against the CSV header.
    return lines
        .Skip(1)
        .Select(line => line.Split(','))
        .ToDictionary(
            fields => DateTime.Parse(fields[0], System.Globalization.CultureInfo.InvariantCulture),
            fields => decimal.Parse(fields[4], System.Globalization.CultureInfo.InvariantCulture));
}
I can call it like this:
// Request the prices between 1 Jan 2021 and 1 Feb 2021.
DateTime rangeStart = new DateTime(2021, 1, 1);
DateTime rangeEnd = new DateTime(2021, 2, 1);
Dictionary<DateTime, decimal> output = await GetHistoricalPricesAsync(rangeStart, rangeEnd);
That gives me:
It works fine for a longer time span.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论