Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.metadata
48 changes: 48 additions & 0 deletions 221801132/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
WordCount
---------------


**基本功能**

假设有一个软件每隔一小段时间会记录一次用户的搜索记录,记录为英文。
输入文件和输出文件以命令行参数传入。例如我们在命令行窗口(cmd)中输入:


>java WordCount input.txt output.txt

则会统计input.txt中的以下几个指标

**1、统计文件的字符数(对应输出第一行):**

- 只需要统计Ascii码,汉字不需考虑
- 空格,水平制表符,换行符,均算字符



**2、统计文件的单词总数(对应输出第二行),单词:至少以4个英文字母开头,跟上字母数字符号,单词以分隔符分割,不区分大小写。**

- 英文字母: A-Z,a-z
- 字母数字符号:A-Z, a-z,0-9
- 分割符:空格,非字母数字符号
- 例:file123是一个单词, 123file不是一个单词。file,File和FILE是同一个单词



**3、统计文件的有效行数(对应输出第三行):任何包含非空白字符的行,都需要统计。**

**4、统计文件中各单词的出现次数(对应输出接下来10行),最终只输出频率最高的10个。**

- 频率相同的单词,优先输出字典序靠前的单词。

>例如,windows95,windows98和windows2000同时出现时,则先输出windows2000

- 输出的单词统一为小写格式

**然后将统计结果输出到output.txt,输出的格式如下;其中word1和word2 对应具体的单词,number为统计出的个数;换行使用'\n',编码统一使用UTF-8。**

>characters: number
words: number
lines: number
word1: number
word2: number
...
51 changes: 51 additions & 0 deletions 221801132/codestyle.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
代码规范
----


----------

**缩进**

- 4个空格

**每行最多字符数**

- 80字符

**函数最大行数**

- 100行


**函数、类命名**

- 类名使用UpperCamelCase风格,必须遵从驼峰形式。
- 命名尽量使用英文单词,力求简单清楚

**变量命名**

- 尽量使用单词命名,一律小写
- 禁止取单个字符(如i、j、k),但 i、j、k作局部循环变量是允许的。

**常量**

- 全部大写
- 不允许未经定义的常量直接出现在代码中

**空行规则**

- 相对独立的程序块之间、变量说明之后必须加空行
- 不允许把多个短语句写在一行中,一行只写一条语句
- if、for、do、while、case、switch、default 等语句自占一行,且if、for、do、while等语句的执行语句部分无论多少都要加括号{}。

**注释规则**

- 使用//
- 注释的内容要清楚、明了,不能有二义性
- 操作符前后空格:操作符前后必须各加一个空格

**其他规则**

- 严禁使用拼音与英文混合的方式,更不允许直接使用中文的方式
- 用大写的’L’代替’l’

173 changes: 173 additions & 0 deletions 221801132/src/Lib.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package WordCount;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class Lib {
    // A word is four or more leading ASCII letters followed by any number
    // of ASCII letters or digits, e.g. "file" and "file123" but not
    // "123file" (README, rule 2).
    private static final Pattern WORD_PATTERN =
            Pattern.compile("[a-zA-Z]{4}[a-zA-Z0-9]*");

    // Number of most frequent words that WordsNumSort writes.
    private static final int TOP_WORD_LIMIT = 10;

    // Opens fileName for reading as UTF-8.
    // Legacy contract kept for existing callers: when the file cannot be
    // opened, a message is printed and null is returned, so the result
    // must be checked before use.
    public static Reader InputFile(String fileName) {
        try {
            return openInput(fileName);
        } catch (FileNotFoundException e) {
            System.out.println("找不到输入文件!");
            return null;
        }
    }

    // Opens fileName for appending UTF-8 text. The file is created when it
    // does not exist yet. The caller is responsible for closing the writer.
    public static BufferedWriter OutputFile(String fileName)
            throws IOException {
        return new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(new File(fileName), true),
                StandardCharsets.UTF_8));
    }

    // Counts every character of InputFile (spaces, tabs and line breaks
    // included) and writes "characters: N" to OutputFile.
    // This call replaces any previous content of OutputFile; the other
    // statistics methods append to it, so this one must run first.
    // Returns the character count.
    // Throws IOException when the input cannot be read (including a
    // missing input file) or the output cannot be written.
    public static int CountCharacters(String InputFile, String OutputFile)
            throws IOException {
        int charactersNum = 0;

        // The input is read before the output is touched, so a missing
        // input file no longer wipes an existing output file.
        try (Reader reader = openInput(InputFile)) {
            while (reader.read() != -1) {
                charactersNum++;
            }
        }

        writeText(OutputFile, false, "characters: " + charactersNum + "\n");
        return charactersNum;
    }

    // Counts the words of inputFile (repeated words count every time) and
    // appends "words: N" to outputFile.
    // Returns the word count.
    // Throws IOException when the input cannot be read or the output
    // cannot be written.
    public static int CountWords(String inputFile, String outputFile)
            throws IOException {
        int wordsNum = 0;

        for (int occurrences : wordFrequencies(inputFile).values()) {
            wordsNum += occurrences;
        }

        writeText(outputFile, true, "words: " + wordsNum + "\n");
        return wordsNum;
    }

    // Counts the lines of inputFile that contain at least one
    // non-whitespace character and appends "lines: N" to outputFile.
    // Lines are separated by '\n'; a trailing '\r' counts as whitespace.
    // Returns the number of such lines.
    // Throws IOException when the input cannot be read or the output
    // cannot be written.
    public static int CountLines(String inputFile, String outputFile)
            throws IOException {
        int linesNum = 0;
        boolean hasContent = false;

        try (Reader reader = openInput(inputFile)) {
            int ch;
            while ((ch = reader.read()) != -1) {
                if (ch == '\n') {
                    if (hasContent) {
                        linesNum++;
                    }
                    hasContent = false;
                } else if (!Character.isWhitespace(ch)) {
                    hasContent = true;
                }
            }
        }

        // The last line may end at end-of-file without a line break.
        if (hasContent) {
            linesNum++;
        }

        writeText(outputFile, true, "lines: " + linesNum + "\n");
        return linesNum;
    }

    // Appends the most frequent words of inputFile (at most ten) to
    // outputFile, one "word: N" line each. Words are lower-cased, ordered
    // by descending frequency and, for equal frequency, by ascending
    // dictionary order.
    // Returns the last word written (the least frequent of those listed),
    // or null when the input contains no word.
    // Throws IOException when the input cannot be read or the output
    // cannot be written.
    public static String WordsNumSort(String inputFile, String outputFile)
            throws IOException {
        List<Map.Entry<String, Integer>> ranked =
                new ArrayList<>(wordFrequencies(inputFile).entrySet());

        ranked.sort((left, right) -> {
            int byCount = Integer.compare(right.getValue(), left.getValue());
            if (byCount != 0) {
                return byCount;
            }
            return left.getKey().compareTo(right.getKey());
        });

        int shown = Math.min(TOP_WORD_LIMIT, ranked.size());
        StringBuilder report = new StringBuilder();
        String lastWord = null;

        for (Map.Entry<String, Integer> entry : ranked.subList(0, shown)) {
            lastWord = entry.getKey();
            report.append(lastWord)
                    .append(": ")
                    .append(entry.getValue())
                    .append('\n');
        }

        writeText(outputFile, true, report.toString());
        return lastWord;
    }

    // Opens fileName as a buffered UTF-8 reader, reporting a missing file
    // with an exception instead of null.
    private static Reader openInput(String fileName)
            throws FileNotFoundException {
        return new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(fileName)),
                StandardCharsets.UTF_8));
    }

    // Writes text to fileName as UTF-8, either appending to the file or
    // replacing its content, and always closes the file.
    private static void writeText(String fileName, boolean append,
            String text) throws IOException {
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(new File(fileName), append),
                StandardCharsets.UTF_8))) {
            writer.write(text);
        }
    }

    // Splits inputFile into runs of ASCII letters and digits and returns
    // how often each valid word occurs, keyed by the lower-cased word.
    private static Map<String, Integer> wordFrequencies(String inputFile)
            throws IOException {
        Map<String, Integer> frequencies = new HashMap<>();
        StringBuilder token = new StringBuilder();

        try (Reader reader = openInput(inputFile)) {
            int ch;
            while ((ch = reader.read()) != -1) {
                if (isAsciiLetterOrDigit(ch)) {
                    token.append((char) ch);
                } else {
                    // Any other character is a separator and ends a token.
                    recordToken(token, frequencies);
                }
            }
        }

        // The final token may end at end-of-file without a separator.
        recordToken(token, frequencies);
        return frequencies;
    }

    // Adds token to frequencies when it is a valid word, then empties it
    // so the next token can be collected.
    private static void recordToken(StringBuilder token,
            Map<String, Integer> frequencies) {
        if (WORD_PATTERN.matcher(token).matches()) {
            String word = token.toString().toLowerCase(Locale.ROOT);
            frequencies.merge(word, 1, Integer::sum);
        }
        token.setLength(0);
    }

    // Tells whether ch is one of A-Z, a-z or 0-9.
    private static boolean isAsciiLetterOrDigit(int ch) {
        return (ch >= 'a' && ch <= 'z')
                || (ch >= 'A' && ch <= 'Z')
                || (ch >= '0' && ch <= '9');
    }
}
24 changes: 24 additions & 0 deletions 221801132/src/WordCount.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package WordCount;

import java.io.*;
import java.util.*;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class WordCount {
    // Number of command-line arguments: input file and output file.
    private static final int EXPECTED_ARGS = 2;

    // Exit status used when the command line is malformed. A nonzero
    // status tells the calling shell or script that the run failed.
    private static final int USAGE_ERROR = 1;

    // Entry point: java WordCount <input file> <output file>.
    // Writes the character, word and line counts of the input file and its
    // most frequent words to the output file.
    // Throws IOException when one of the Lib statistics methods fails.
    public static void main(String[] args) throws IOException {
        if (args.length != EXPECTED_ARGS) {
            System.out.println("命令行参数错误,需要两个文件名!");
            System.exit(USAGE_ERROR);
        }

        String inputFile = args[0];
        String outputFile = args[1];

        // Order matters: CountCharacters starts the output file afresh and
        // the remaining calls append to it.
        Lib.CountCharacters(inputFile, outputFile);
        Lib.CountWords(inputFile, outputFile);
        Lib.CountLines(inputFile, outputFile);
        Lib.WordsNumSort(inputFile, outputFile);
    }
}